matmulavx128_c8.c revision 1.1.1.1
1/* Implementation of the MATMUL intrinsic
2   Copyright (C) 2002-2019 Free Software Foundation, Inc.
3   Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31/* These are the specific versions of matmul with -mprefer-avx128.  */
32
33#if defined (HAVE_GFC_COMPLEX_8)
34
35/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36   passed to us by the front-end, in which case we call it for large
37   matrices.  */
38
39typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40                          const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
41                          const int *, const GFC_COMPLEX_8 *, const int *,
42                          const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
43                          int, int);
44
45#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46void
47matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
48	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
49	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50internal_proto(matmul_c8_avx128_fma3);
51void
52matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
53	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
54	int blas_limit, blas_call gemm)
55{
56  const GFC_COMPLEX_8 * restrict abase;
57  const GFC_COMPLEX_8 * restrict bbase;
58  GFC_COMPLEX_8 * restrict dest;
59
60  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61  index_type x, y, n, count, xcount, ycount;
62
63  assert (GFC_DESCRIPTOR_RANK (a) == 2
64          || GFC_DESCRIPTOR_RANK (b) == 2);
65
66/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68   Either A or B (but not both) can be rank 1:
69
70   o One-dimensional argument A is implicitly treated as a row matrix
71     dimensioned [1,count], so xcount=1.
72
73   o One-dimensional argument B is implicitly treated as a column matrix
74     dimensioned [count, 1], so ycount=1.
75*/
76
77  if (retarray->base_addr == NULL)
78    {
79      if (GFC_DESCRIPTOR_RANK (a) == 1)
80        {
81	  GFC_DIMENSION_SET(retarray->dim[0], 0,
82	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83        }
84      else if (GFC_DESCRIPTOR_RANK (b) == 1)
85        {
86	  GFC_DIMENSION_SET(retarray->dim[0], 0,
87	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88        }
89      else
90        {
91	  GFC_DIMENSION_SET(retarray->dim[0], 0,
92	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94          GFC_DIMENSION_SET(retarray->dim[1], 0,
95	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96			    GFC_DESCRIPTOR_EXTENT(retarray,0));
97        }
98
99      retarray->base_addr
100	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
101      retarray->offset = 0;
102    }
103  else if (unlikely (compile_options.bounds_check))
104    {
105      index_type ret_extent, arg_extent;
106
107      if (GFC_DESCRIPTOR_RANK (a) == 1)
108	{
109	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111	  if (arg_extent != ret_extent)
112	    runtime_error ("Array bound mismatch for dimension 1 of "
113	    		   "array (%ld/%ld) ",
114			   (long int) ret_extent, (long int) arg_extent);
115	}
116      else if (GFC_DESCRIPTOR_RANK (b) == 1)
117	{
118	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120	  if (arg_extent != ret_extent)
121	    runtime_error ("Array bound mismatch for dimension 1 of "
122	    		   "array (%ld/%ld) ",
123			   (long int) ret_extent, (long int) arg_extent);
124	}
125      else
126	{
127	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129	  if (arg_extent != ret_extent)
130	    runtime_error ("Array bound mismatch for dimension 1 of "
131	    		   "array (%ld/%ld) ",
132			   (long int) ret_extent, (long int) arg_extent);
133
134	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136	  if (arg_extent != ret_extent)
137	    runtime_error ("Array bound mismatch for dimension 2 of "
138	    		   "array (%ld/%ld) ",
139			   (long int) ret_extent, (long int) arg_extent);
140	}
141    }
142
143
144  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145    {
146      /* One-dimensional result may be addressed in the code below
147	 either as a row or a column matrix. We want both cases to
148	 work. */
149      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150    }
151  else
152    {
153      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155    }
156
157
158  if (GFC_DESCRIPTOR_RANK (a) == 1)
159    {
160      /* Treat it as a a row matrix A[1,count]. */
161      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162      aystride = 1;
163
164      xcount = 1;
165      count = GFC_DESCRIPTOR_EXTENT(a,0);
166    }
167  else
168    {
169      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171
172      count = GFC_DESCRIPTOR_EXTENT(a,1);
173      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174    }
175
176  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177    {
178      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180		       "in dimension 1: is %ld, should be %ld",
181		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182    }
183
184  if (GFC_DESCRIPTOR_RANK (b) == 1)
185    {
186      /* Treat it as a column matrix B[count,1] */
187      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189      /* bystride should never be used for 1-dimensional b.
190         The value is only used for calculation of the
191         memory by the buffer.  */
192      bystride = 256;
193      ycount = 1;
194    }
195  else
196    {
197      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200    }
201
202  abase = a->base_addr;
203  bbase = b->base_addr;
204  dest = retarray->base_addr;
205
206  /* Now that everything is set up, we perform the multiplication
207     itself.  */
208
209#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210#define min(a,b) ((a) <= (b) ? (a) : (b))
211#define max(a,b) ((a) >= (b) ? (a) : (b))
212
213  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214      && (bxstride == 1 || bystride == 1)
215      && (((float) xcount) * ((float) ycount) * ((float) count)
216          > POW3(blas_limit)))
217    {
218      const int m = xcount, n = ycount, k = count, ldc = rystride;
219      const GFC_COMPLEX_8 one = 1, zero = 0;
220      const int lda = (axstride == 1) ? aystride : axstride,
221		ldb = (bxstride == 1) ? bystride : bxstride;
222
223      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224	{
225	  assert (gemm != NULL);
226	  const char *transa, *transb;
227	  if (try_blas & 2)
228	    transa = "C";
229	  else
230	    transa = axstride == 1 ? "N" : "T";
231
232	  if (try_blas & 4)
233	    transb = "C";
234	  else
235	    transb = bxstride == 1 ? "N" : "T";
236
237	  gemm (transa, transb , &m,
238		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
239		&ldc, 1, 1);
240	  return;
241	}
242    }
243
244  if (rxstride == 1 && axstride == 1 && bxstride == 1)
245    {
246      /* This block of code implements a tuned matmul, derived from
247         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
248
249               Bo Kagstrom and Per Ling
250               Department of Computing Science
251               Umea University
252               S-901 87 Umea, Sweden
253
254	 from netlib.org, translated to C, and modified for matmul.m4.  */
255
256      const GFC_COMPLEX_8 *a, *b;
257      GFC_COMPLEX_8 *c;
258      const index_type m = xcount, n = ycount, k = count;
259
260      /* System generated locals */
261      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
262		 i1, i2, i3, i4, i5, i6;
263
264      /* Local variables */
265      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
266		 f13, f14, f23, f24, f33, f34, f43, f44;
267      index_type i, j, l, ii, jj, ll;
268      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
269      GFC_COMPLEX_8 *t1;
270
271      a = abase;
272      b = bbase;
273      c = retarray->base_addr;
274
275      /* Parameter adjustments */
276      c_dim1 = rystride;
277      c_offset = 1 + c_dim1;
278      c -= c_offset;
279      a_dim1 = aystride;
280      a_offset = 1 + a_dim1;
281      a -= a_offset;
282      b_dim1 = bystride;
283      b_offset = 1 + b_dim1;
284      b -= b_offset;
285
286      /* Empty c first.  */
287      for (j=1; j<=n; j++)
288	for (i=1; i<=m; i++)
289	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
290
291      /* Early exit if possible */
292      if (m == 0 || n == 0 || k == 0)
293	return;
294
295      /* Adjust size of t1 to what is needed.  */
296      index_type t1_dim, a_sz;
297      if (aystride == 1)
298        a_sz = rystride;
299      else
300        a_sz = a_dim1;
301
302      t1_dim = a_sz * 256 + b_dim1;
303      if (t1_dim > 65536)
304	t1_dim = 65536;
305
306      t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
307
308      /* Start turning the crank. */
309      i1 = n;
310      for (jj = 1; jj <= i1; jj += 512)
311	{
312	  /* Computing MIN */
313	  i2 = 512;
314	  i3 = n - jj + 1;
315	  jsec = min(i2,i3);
316	  ujsec = jsec - jsec % 4;
317	  i2 = k;
318	  for (ll = 1; ll <= i2; ll += 256)
319	    {
320	      /* Computing MIN */
321	      i3 = 256;
322	      i4 = k - ll + 1;
323	      lsec = min(i3,i4);
324	      ulsec = lsec - lsec % 2;
325
326	      i3 = m;
327	      for (ii = 1; ii <= i3; ii += 256)
328		{
329		  /* Computing MIN */
330		  i4 = 256;
331		  i5 = m - ii + 1;
332		  isec = min(i4,i5);
333		  uisec = isec - isec % 2;
334		  i4 = ll + ulsec - 1;
335		  for (l = ll; l <= i4; l += 2)
336		    {
337		      i5 = ii + uisec - 1;
338		      for (i = ii; i <= i5; i += 2)
339			{
340			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
341					a[i + l * a_dim1];
342			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
343					a[i + (l + 1) * a_dim1];
344			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
345					a[i + 1 + l * a_dim1];
346			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
347					a[i + 1 + (l + 1) * a_dim1];
348			}
349		      if (uisec < isec)
350			{
351			  t1[l - ll + 1 + (isec << 8) - 257] =
352				    a[ii + isec - 1 + l * a_dim1];
353			  t1[l - ll + 2 + (isec << 8) - 257] =
354				    a[ii + isec - 1 + (l + 1) * a_dim1];
355			}
356		    }
357		  if (ulsec < lsec)
358		    {
359		      i4 = ii + isec - 1;
360		      for (i = ii; i<= i4; ++i)
361			{
362			  t1[lsec + ((i - ii + 1) << 8) - 257] =
363				    a[i + (ll + lsec - 1) * a_dim1];
364			}
365		    }
366
367		  uisec = isec - isec % 4;
368		  i4 = jj + ujsec - 1;
369		  for (j = jj; j <= i4; j += 4)
370		    {
371		      i5 = ii + uisec - 1;
372		      for (i = ii; i <= i5; i += 4)
373			{
374			  f11 = c[i + j * c_dim1];
375			  f21 = c[i + 1 + j * c_dim1];
376			  f12 = c[i + (j + 1) * c_dim1];
377			  f22 = c[i + 1 + (j + 1) * c_dim1];
378			  f13 = c[i + (j + 2) * c_dim1];
379			  f23 = c[i + 1 + (j + 2) * c_dim1];
380			  f14 = c[i + (j + 3) * c_dim1];
381			  f24 = c[i + 1 + (j + 3) * c_dim1];
382			  f31 = c[i + 2 + j * c_dim1];
383			  f41 = c[i + 3 + j * c_dim1];
384			  f32 = c[i + 2 + (j + 1) * c_dim1];
385			  f42 = c[i + 3 + (j + 1) * c_dim1];
386			  f33 = c[i + 2 + (j + 2) * c_dim1];
387			  f43 = c[i + 3 + (j + 2) * c_dim1];
388			  f34 = c[i + 2 + (j + 3) * c_dim1];
389			  f44 = c[i + 3 + (j + 3) * c_dim1];
390			  i6 = ll + lsec - 1;
391			  for (l = ll; l <= i6; ++l)
392			    {
393			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
394				      * b[l + j * b_dim1];
395			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
396				      * b[l + j * b_dim1];
397			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
398				      * b[l + (j + 1) * b_dim1];
399			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
400				      * b[l + (j + 1) * b_dim1];
401			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
402				      * b[l + (j + 2) * b_dim1];
403			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
404				      * b[l + (j + 2) * b_dim1];
405			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
406				      * b[l + (j + 3) * b_dim1];
407			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
408				      * b[l + (j + 3) * b_dim1];
409			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
410				      * b[l + j * b_dim1];
411			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
412				      * b[l + j * b_dim1];
413			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
414				      * b[l + (j + 1) * b_dim1];
415			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
416				      * b[l + (j + 1) * b_dim1];
417			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
418				      * b[l + (j + 2) * b_dim1];
419			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
420				      * b[l + (j + 2) * b_dim1];
421			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
422				      * b[l + (j + 3) * b_dim1];
423			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
424				      * b[l + (j + 3) * b_dim1];
425			    }
426			  c[i + j * c_dim1] = f11;
427			  c[i + 1 + j * c_dim1] = f21;
428			  c[i + (j + 1) * c_dim1] = f12;
429			  c[i + 1 + (j + 1) * c_dim1] = f22;
430			  c[i + (j + 2) * c_dim1] = f13;
431			  c[i + 1 + (j + 2) * c_dim1] = f23;
432			  c[i + (j + 3) * c_dim1] = f14;
433			  c[i + 1 + (j + 3) * c_dim1] = f24;
434			  c[i + 2 + j * c_dim1] = f31;
435			  c[i + 3 + j * c_dim1] = f41;
436			  c[i + 2 + (j + 1) * c_dim1] = f32;
437			  c[i + 3 + (j + 1) * c_dim1] = f42;
438			  c[i + 2 + (j + 2) * c_dim1] = f33;
439			  c[i + 3 + (j + 2) * c_dim1] = f43;
440			  c[i + 2 + (j + 3) * c_dim1] = f34;
441			  c[i + 3 + (j + 3) * c_dim1] = f44;
442			}
443		      if (uisec < isec)
444			{
445			  i5 = ii + isec - 1;
446			  for (i = ii + uisec; i <= i5; ++i)
447			    {
448			      f11 = c[i + j * c_dim1];
449			      f12 = c[i + (j + 1) * c_dim1];
450			      f13 = c[i + (j + 2) * c_dim1];
451			      f14 = c[i + (j + 3) * c_dim1];
452			      i6 = ll + lsec - 1;
453			      for (l = ll; l <= i6; ++l)
454				{
455				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
456					  257] * b[l + j * b_dim1];
457				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
458					  257] * b[l + (j + 1) * b_dim1];
459				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
460					  257] * b[l + (j + 2) * b_dim1];
461				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
462					  257] * b[l + (j + 3) * b_dim1];
463				}
464			      c[i + j * c_dim1] = f11;
465			      c[i + (j + 1) * c_dim1] = f12;
466			      c[i + (j + 2) * c_dim1] = f13;
467			      c[i + (j + 3) * c_dim1] = f14;
468			    }
469			}
470		    }
471		  if (ujsec < jsec)
472		    {
473		      i4 = jj + jsec - 1;
474		      for (j = jj + ujsec; j <= i4; ++j)
475			{
476			  i5 = ii + uisec - 1;
477			  for (i = ii; i <= i5; i += 4)
478			    {
479			      f11 = c[i + j * c_dim1];
480			      f21 = c[i + 1 + j * c_dim1];
481			      f31 = c[i + 2 + j * c_dim1];
482			      f41 = c[i + 3 + j * c_dim1];
483			      i6 = ll + lsec - 1;
484			      for (l = ll; l <= i6; ++l)
485				{
486				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
487					  257] * b[l + j * b_dim1];
488				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
489					  257] * b[l + j * b_dim1];
490				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
491					  257] * b[l + j * b_dim1];
492				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
493					  257] * b[l + j * b_dim1];
494				}
495			      c[i + j * c_dim1] = f11;
496			      c[i + 1 + j * c_dim1] = f21;
497			      c[i + 2 + j * c_dim1] = f31;
498			      c[i + 3 + j * c_dim1] = f41;
499			    }
500			  i5 = ii + isec - 1;
501			  for (i = ii + uisec; i <= i5; ++i)
502			    {
503			      f11 = c[i + j * c_dim1];
504			      i6 = ll + lsec - 1;
505			      for (l = ll; l <= i6; ++l)
506				{
507				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
508					  257] * b[l + j * b_dim1];
509				}
510			      c[i + j * c_dim1] = f11;
511			    }
512			}
513		    }
514		}
515	    }
516	}
517      free(t1);
518      return;
519    }
520  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
521    {
522      if (GFC_DESCRIPTOR_RANK (a) != 1)
523	{
524	  const GFC_COMPLEX_8 *restrict abase_x;
525	  const GFC_COMPLEX_8 *restrict bbase_y;
526	  GFC_COMPLEX_8 *restrict dest_y;
527	  GFC_COMPLEX_8 s;
528
529	  for (y = 0; y < ycount; y++)
530	    {
531	      bbase_y = &bbase[y*bystride];
532	      dest_y = &dest[y*rystride];
533	      for (x = 0; x < xcount; x++)
534		{
535		  abase_x = &abase[x*axstride];
536		  s = (GFC_COMPLEX_8) 0;
537		  for (n = 0; n < count; n++)
538		    s += abase_x[n] * bbase_y[n];
539		  dest_y[x] = s;
540		}
541	    }
542	}
543      else
544	{
545	  const GFC_COMPLEX_8 *restrict bbase_y;
546	  GFC_COMPLEX_8 s;
547
548	  for (y = 0; y < ycount; y++)
549	    {
550	      bbase_y = &bbase[y*bystride];
551	      s = (GFC_COMPLEX_8) 0;
552	      for (n = 0; n < count; n++)
553		s += abase[n*axstride] * bbase_y[n];
554	      dest[y*rystride] = s;
555	    }
556	}
557    }
558  else if (axstride < aystride)
559    {
560      for (y = 0; y < ycount; y++)
561	for (x = 0; x < xcount; x++)
562	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
563
564      for (y = 0; y < ycount; y++)
565	for (n = 0; n < count; n++)
566	  for (x = 0; x < xcount; x++)
567	    /* dest[x,y] += a[x,n] * b[n,y] */
568	    dest[x*rxstride + y*rystride] +=
569					abase[x*axstride + n*aystride] *
570					bbase[n*bxstride + y*bystride];
571    }
572  else if (GFC_DESCRIPTOR_RANK (a) == 1)
573    {
574      const GFC_COMPLEX_8 *restrict bbase_y;
575      GFC_COMPLEX_8 s;
576
577      for (y = 0; y < ycount; y++)
578	{
579	  bbase_y = &bbase[y*bystride];
580	  s = (GFC_COMPLEX_8) 0;
581	  for (n = 0; n < count; n++)
582	    s += abase[n*axstride] * bbase_y[n*bxstride];
583	  dest[y*rxstride] = s;
584	}
585    }
586  else
587    {
588      const GFC_COMPLEX_8 *restrict abase_x;
589      const GFC_COMPLEX_8 *restrict bbase_y;
590      GFC_COMPLEX_8 *restrict dest_y;
591      GFC_COMPLEX_8 s;
592
593      for (y = 0; y < ycount; y++)
594	{
595	  bbase_y = &bbase[y*bystride];
596	  dest_y = &dest[y*rystride];
597	  for (x = 0; x < xcount; x++)
598	    {
599	      abase_x = &abase[x*axstride];
600	      s = (GFC_COMPLEX_8) 0;
601	      for (n = 0; n < count; n++)
602		s += abase_x[n*aystride] * bbase_y[n*bxstride];
603	      dest_y[x*rxstride] = s;
604	    }
605	}
606    }
607}
608#undef POW3
609#undef min
610#undef max
611
612#endif
613
614#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
615void
616matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
617	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
618	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
619internal_proto(matmul_c8_avx128_fma4);
620void
621matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
622	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
623	int blas_limit, blas_call gemm)
624{
625  const GFC_COMPLEX_8 * restrict abase;
626  const GFC_COMPLEX_8 * restrict bbase;
627  GFC_COMPLEX_8 * restrict dest;
628
629  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
630  index_type x, y, n, count, xcount, ycount;
631
632  assert (GFC_DESCRIPTOR_RANK (a) == 2
633          || GFC_DESCRIPTOR_RANK (b) == 2);
634
635/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
636
637   Either A or B (but not both) can be rank 1:
638
639   o One-dimensional argument A is implicitly treated as a row matrix
640     dimensioned [1,count], so xcount=1.
641
642   o One-dimensional argument B is implicitly treated as a column matrix
643     dimensioned [count, 1], so ycount=1.
644*/
645
646  if (retarray->base_addr == NULL)
647    {
648      if (GFC_DESCRIPTOR_RANK (a) == 1)
649        {
650	  GFC_DIMENSION_SET(retarray->dim[0], 0,
651	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
652        }
653      else if (GFC_DESCRIPTOR_RANK (b) == 1)
654        {
655	  GFC_DIMENSION_SET(retarray->dim[0], 0,
656	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
657        }
658      else
659        {
660	  GFC_DIMENSION_SET(retarray->dim[0], 0,
661	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
662
663          GFC_DIMENSION_SET(retarray->dim[1], 0,
664	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
665			    GFC_DESCRIPTOR_EXTENT(retarray,0));
666        }
667
668      retarray->base_addr
669	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
670      retarray->offset = 0;
671    }
672  else if (unlikely (compile_options.bounds_check))
673    {
674      index_type ret_extent, arg_extent;
675
676      if (GFC_DESCRIPTOR_RANK (a) == 1)
677	{
678	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
679	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
680	  if (arg_extent != ret_extent)
681	    runtime_error ("Array bound mismatch for dimension 1 of "
682	    		   "array (%ld/%ld) ",
683			   (long int) ret_extent, (long int) arg_extent);
684	}
685      else if (GFC_DESCRIPTOR_RANK (b) == 1)
686	{
687	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
688	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689	  if (arg_extent != ret_extent)
690	    runtime_error ("Array bound mismatch for dimension 1 of "
691	    		   "array (%ld/%ld) ",
692			   (long int) ret_extent, (long int) arg_extent);
693	}
694      else
695	{
696	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698	  if (arg_extent != ret_extent)
699	    runtime_error ("Array bound mismatch for dimension 1 of "
700	    		   "array (%ld/%ld) ",
701			   (long int) ret_extent, (long int) arg_extent);
702
703	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
704	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
705	  if (arg_extent != ret_extent)
706	    runtime_error ("Array bound mismatch for dimension 2 of "
707	    		   "array (%ld/%ld) ",
708			   (long int) ret_extent, (long int) arg_extent);
709	}
710    }
711
712
713  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
714    {
715      /* One-dimensional result may be addressed in the code below
716	 either as a row or a column matrix. We want both cases to
717	 work. */
718      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
719    }
720  else
721    {
722      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
723      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
724    }
725
726
727  if (GFC_DESCRIPTOR_RANK (a) == 1)
728    {
729      /* Treat it as a a row matrix A[1,count]. */
730      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
731      aystride = 1;
732
733      xcount = 1;
734      count = GFC_DESCRIPTOR_EXTENT(a,0);
735    }
736  else
737    {
738      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
739      aystride = GFC_DESCRIPTOR_STRIDE(a,1);
740
741      count = GFC_DESCRIPTOR_EXTENT(a,1);
742      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
743    }
744
745  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
746    {
747      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
748	runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
749		       "in dimension 1: is %ld, should be %ld",
750		       (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
751    }
752
753  if (GFC_DESCRIPTOR_RANK (b) == 1)
754    {
755      /* Treat it as a column matrix B[count,1] */
756      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
757
758      /* bystride should never be used for 1-dimensional b.
759         The value is only used for calculation of the
760         memory by the buffer.  */
761      bystride = 256;
762      ycount = 1;
763    }
764  else
765    {
766      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
767      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
768      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
769    }
770
771  abase = a->base_addr;
772  bbase = b->base_addr;
773  dest = retarray->base_addr;
774
775  /* Now that everything is set up, we perform the multiplication
776     itself.  */
777
778#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
779#define min(a,b) ((a) <= (b) ? (a) : (b))
780#define max(a,b) ((a) >= (b) ? (a) : (b))
781
782  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
783      && (bxstride == 1 || bystride == 1)
784      && (((float) xcount) * ((float) ycount) * ((float) count)
785          > POW3(blas_limit)))
786    {
787      const int m = xcount, n = ycount, k = count, ldc = rystride;
788      const GFC_COMPLEX_8 one = 1, zero = 0;
789      const int lda = (axstride == 1) ? aystride : axstride,
790		ldb = (bxstride == 1) ? bystride : bxstride;
791
792      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
793	{
794	  assert (gemm != NULL);
795	  const char *transa, *transb;
796	  if (try_blas & 2)
797	    transa = "C";
798	  else
799	    transa = axstride == 1 ? "N" : "T";
800
801	  if (try_blas & 4)
802	    transb = "C";
803	  else
804	    transb = bxstride == 1 ? "N" : "T";
805
806	  gemm (transa, transb , &m,
807		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
808		&ldc, 1, 1);
809	  return;
810	}
811    }
812
813  if (rxstride == 1 && axstride == 1 && bxstride == 1)
814    {
815      /* This block of code implements a tuned matmul, derived from
816         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
817
818               Bo Kagstrom and Per Ling
819               Department of Computing Science
820               Umea University
821               S-901 87 Umea, Sweden
822
823	 from netlib.org, translated to C, and modified for matmul.m4.  */
824
825      const GFC_COMPLEX_8 *a, *b;
826      GFC_COMPLEX_8 *c;
827      const index_type m = xcount, n = ycount, k = count;
828
829      /* System generated locals */
830      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
831		 i1, i2, i3, i4, i5, i6;
832
833      /* Local variables */
834      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
835		 f13, f14, f23, f24, f33, f34, f43, f44;
836      index_type i, j, l, ii, jj, ll;
837      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
838      GFC_COMPLEX_8 *t1;
839
840      a = abase;
841      b = bbase;
842      c = retarray->base_addr;
843
844      /* Parameter adjustments */
845      c_dim1 = rystride;
846      c_offset = 1 + c_dim1;
847      c -= c_offset;
848      a_dim1 = aystride;
849      a_offset = 1 + a_dim1;
850      a -= a_offset;
851      b_dim1 = bystride;
852      b_offset = 1 + b_dim1;
853      b -= b_offset;
854
855      /* Empty c first.  */
856      for (j=1; j<=n; j++)
857	for (i=1; i<=m; i++)
858	  c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
859
860      /* Early exit if possible */
861      if (m == 0 || n == 0 || k == 0)
862	return;
863
864      /* Adjust size of t1 to what is needed.  */
865      index_type t1_dim, a_sz;
866      if (aystride == 1)
867        a_sz = rystride;
868      else
869        a_sz = a_dim1;
870
871      t1_dim = a_sz * 256 + b_dim1;
872      if (t1_dim > 65536)
873	t1_dim = 65536;
874
875      t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_8));
876
877      /* Start turning the crank. */
878      i1 = n;
879      for (jj = 1; jj <= i1; jj += 512)
880	{
881	  /* Computing MIN */
882	  i2 = 512;
883	  i3 = n - jj + 1;
884	  jsec = min(i2,i3);
885	  ujsec = jsec - jsec % 4;
886	  i2 = k;
887	  for (ll = 1; ll <= i2; ll += 256)
888	    {
889	      /* Computing MIN */
890	      i3 = 256;
891	      i4 = k - ll + 1;
892	      lsec = min(i3,i4);
893	      ulsec = lsec - lsec % 2;
894
895	      i3 = m;
896	      for (ii = 1; ii <= i3; ii += 256)
897		{
898		  /* Computing MIN */
899		  i4 = 256;
900		  i5 = m - ii + 1;
901		  isec = min(i4,i5);
902		  uisec = isec - isec % 2;
903		  i4 = ll + ulsec - 1;
904		  for (l = ll; l <= i4; l += 2)
905		    {
906		      i5 = ii + uisec - 1;
907		      for (i = ii; i <= i5; i += 2)
908			{
909			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
910					a[i + l * a_dim1];
911			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
912					a[i + (l + 1) * a_dim1];
913			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
914					a[i + 1 + l * a_dim1];
915			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
916					a[i + 1 + (l + 1) * a_dim1];
917			}
918		      if (uisec < isec)
919			{
920			  t1[l - ll + 1 + (isec << 8) - 257] =
921				    a[ii + isec - 1 + l * a_dim1];
922			  t1[l - ll + 2 + (isec << 8) - 257] =
923				    a[ii + isec - 1 + (l + 1) * a_dim1];
924			}
925		    }
926		  if (ulsec < lsec)
927		    {
928		      i4 = ii + isec - 1;
929		      for (i = ii; i<= i4; ++i)
930			{
931			  t1[lsec + ((i - ii + 1) << 8) - 257] =
932				    a[i + (ll + lsec - 1) * a_dim1];
933			}
934		    }
935
936		  uisec = isec - isec % 4;
937		  i4 = jj + ujsec - 1;
938		  for (j = jj; j <= i4; j += 4)
939		    {
940		      i5 = ii + uisec - 1;
941		      for (i = ii; i <= i5; i += 4)
942			{
943			  f11 = c[i + j * c_dim1];
944			  f21 = c[i + 1 + j * c_dim1];
945			  f12 = c[i + (j + 1) * c_dim1];
946			  f22 = c[i + 1 + (j + 1) * c_dim1];
947			  f13 = c[i + (j + 2) * c_dim1];
948			  f23 = c[i + 1 + (j + 2) * c_dim1];
949			  f14 = c[i + (j + 3) * c_dim1];
950			  f24 = c[i + 1 + (j + 3) * c_dim1];
951			  f31 = c[i + 2 + j * c_dim1];
952			  f41 = c[i + 3 + j * c_dim1];
953			  f32 = c[i + 2 + (j + 1) * c_dim1];
954			  f42 = c[i + 3 + (j + 1) * c_dim1];
955			  f33 = c[i + 2 + (j + 2) * c_dim1];
956			  f43 = c[i + 3 + (j + 2) * c_dim1];
957			  f34 = c[i + 2 + (j + 3) * c_dim1];
958			  f44 = c[i + 3 + (j + 3) * c_dim1];
959			  i6 = ll + lsec - 1;
960			  for (l = ll; l <= i6; ++l)
961			    {
962			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963				      * b[l + j * b_dim1];
964			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965				      * b[l + j * b_dim1];
966			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
967				      * b[l + (j + 1) * b_dim1];
968			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
969				      * b[l + (j + 1) * b_dim1];
970			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
971				      * b[l + (j + 2) * b_dim1];
972			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
973				      * b[l + (j + 2) * b_dim1];
974			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975				      * b[l + (j + 3) * b_dim1];
976			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977				      * b[l + (j + 3) * b_dim1];
978			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979				      * b[l + j * b_dim1];
980			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981				      * b[l + j * b_dim1];
982			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
983				      * b[l + (j + 1) * b_dim1];
984			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
985				      * b[l + (j + 1) * b_dim1];
986			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
987				      * b[l + (j + 2) * b_dim1];
988			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
989				      * b[l + (j + 2) * b_dim1];
990			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991				      * b[l + (j + 3) * b_dim1];
992			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993				      * b[l + (j + 3) * b_dim1];
994			    }
995			  c[i + j * c_dim1] = f11;
996			  c[i + 1 + j * c_dim1] = f21;
997			  c[i + (j + 1) * c_dim1] = f12;
998			  c[i + 1 + (j + 1) * c_dim1] = f22;
999			  c[i + (j + 2) * c_dim1] = f13;
1000			  c[i + 1 + (j + 2) * c_dim1] = f23;
1001			  c[i + (j + 3) * c_dim1] = f14;
1002			  c[i + 1 + (j + 3) * c_dim1] = f24;
1003			  c[i + 2 + j * c_dim1] = f31;
1004			  c[i + 3 + j * c_dim1] = f41;
1005			  c[i + 2 + (j + 1) * c_dim1] = f32;
1006			  c[i + 3 + (j + 1) * c_dim1] = f42;
1007			  c[i + 2 + (j + 2) * c_dim1] = f33;
1008			  c[i + 3 + (j + 2) * c_dim1] = f43;
1009			  c[i + 2 + (j + 3) * c_dim1] = f34;
1010			  c[i + 3 + (j + 3) * c_dim1] = f44;
1011			}
1012		      if (uisec < isec)
1013			{
1014			  i5 = ii + isec - 1;
1015			  for (i = ii + uisec; i <= i5; ++i)
1016			    {
1017			      f11 = c[i + j * c_dim1];
1018			      f12 = c[i + (j + 1) * c_dim1];
1019			      f13 = c[i + (j + 2) * c_dim1];
1020			      f14 = c[i + (j + 3) * c_dim1];
1021			      i6 = ll + lsec - 1;
1022			      for (l = ll; l <= i6; ++l)
1023				{
1024				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1025					  257] * b[l + j * b_dim1];
1026				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027					  257] * b[l + (j + 1) * b_dim1];
1028				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029					  257] * b[l + (j + 2) * b_dim1];
1030				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031					  257] * b[l + (j + 3) * b_dim1];
1032				}
1033			      c[i + j * c_dim1] = f11;
1034			      c[i + (j + 1) * c_dim1] = f12;
1035			      c[i + (j + 2) * c_dim1] = f13;
1036			      c[i + (j + 3) * c_dim1] = f14;
1037			    }
1038			}
1039		    }
1040		  if (ujsec < jsec)
1041		    {
1042		      i4 = jj + jsec - 1;
1043		      for (j = jj + ujsec; j <= i4; ++j)
1044			{
1045			  i5 = ii + uisec - 1;
1046			  for (i = ii; i <= i5; i += 4)
1047			    {
1048			      f11 = c[i + j * c_dim1];
1049			      f21 = c[i + 1 + j * c_dim1];
1050			      f31 = c[i + 2 + j * c_dim1];
1051			      f41 = c[i + 3 + j * c_dim1];
1052			      i6 = ll + lsec - 1;
1053			      for (l = ll; l <= i6; ++l)
1054				{
1055				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1056					  257] * b[l + j * b_dim1];
1057				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1058					  257] * b[l + j * b_dim1];
1059				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1060					  257] * b[l + j * b_dim1];
1061				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1062					  257] * b[l + j * b_dim1];
1063				}
1064			      c[i + j * c_dim1] = f11;
1065			      c[i + 1 + j * c_dim1] = f21;
1066			      c[i + 2 + j * c_dim1] = f31;
1067			      c[i + 3 + j * c_dim1] = f41;
1068			    }
1069			  i5 = ii + isec - 1;
1070			  for (i = ii + uisec; i <= i5; ++i)
1071			    {
1072			      f11 = c[i + j * c_dim1];
1073			      i6 = ll + lsec - 1;
1074			      for (l = ll; l <= i6; ++l)
1075				{
1076				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1077					  257] * b[l + j * b_dim1];
1078				}
1079			      c[i + j * c_dim1] = f11;
1080			    }
1081			}
1082		    }
1083		}
1084	    }
1085	}
1086      free(t1);
1087      return;
1088    }
1089  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1090    {
1091      if (GFC_DESCRIPTOR_RANK (a) != 1)
1092	{
1093	  const GFC_COMPLEX_8 *restrict abase_x;
1094	  const GFC_COMPLEX_8 *restrict bbase_y;
1095	  GFC_COMPLEX_8 *restrict dest_y;
1096	  GFC_COMPLEX_8 s;
1097
1098	  for (y = 0; y < ycount; y++)
1099	    {
1100	      bbase_y = &bbase[y*bystride];
1101	      dest_y = &dest[y*rystride];
1102	      for (x = 0; x < xcount; x++)
1103		{
1104		  abase_x = &abase[x*axstride];
1105		  s = (GFC_COMPLEX_8) 0;
1106		  for (n = 0; n < count; n++)
1107		    s += abase_x[n] * bbase_y[n];
1108		  dest_y[x] = s;
1109		}
1110	    }
1111	}
1112      else
1113	{
1114	  const GFC_COMPLEX_8 *restrict bbase_y;
1115	  GFC_COMPLEX_8 s;
1116
1117	  for (y = 0; y < ycount; y++)
1118	    {
1119	      bbase_y = &bbase[y*bystride];
1120	      s = (GFC_COMPLEX_8) 0;
1121	      for (n = 0; n < count; n++)
1122		s += abase[n*axstride] * bbase_y[n];
1123	      dest[y*rystride] = s;
1124	    }
1125	}
1126    }
1127  else if (axstride < aystride)
1128    {
1129      for (y = 0; y < ycount; y++)
1130	for (x = 0; x < xcount; x++)
1131	  dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1132
1133      for (y = 0; y < ycount; y++)
1134	for (n = 0; n < count; n++)
1135	  for (x = 0; x < xcount; x++)
1136	    /* dest[x,y] += a[x,n] * b[n,y] */
1137	    dest[x*rxstride + y*rystride] +=
1138					abase[x*axstride + n*aystride] *
1139					bbase[n*bxstride + y*bystride];
1140    }
1141  else if (GFC_DESCRIPTOR_RANK (a) == 1)
1142    {
1143      const GFC_COMPLEX_8 *restrict bbase_y;
1144      GFC_COMPLEX_8 s;
1145
1146      for (y = 0; y < ycount; y++)
1147	{
1148	  bbase_y = &bbase[y*bystride];
1149	  s = (GFC_COMPLEX_8) 0;
1150	  for (n = 0; n < count; n++)
1151	    s += abase[n*axstride] * bbase_y[n*bxstride];
1152	  dest[y*rxstride] = s;
1153	}
1154    }
1155  else
1156    {
1157      const GFC_COMPLEX_8 *restrict abase_x;
1158      const GFC_COMPLEX_8 *restrict bbase_y;
1159      GFC_COMPLEX_8 *restrict dest_y;
1160      GFC_COMPLEX_8 s;
1161
1162      for (y = 0; y < ycount; y++)
1163	{
1164	  bbase_y = &bbase[y*bystride];
1165	  dest_y = &dest[y*rystride];
1166	  for (x = 0; x < xcount; x++)
1167	    {
1168	      abase_x = &abase[x*axstride];
1169	      s = (GFC_COMPLEX_8) 0;
1170	      for (n = 0; n < count; n++)
1171		s += abase_x[n*aystride] * bbase_y[n*bxstride];
1172	      dest_y[x*rxstride] = s;
1173	    }
1174	}
1175    }
1176}
1177#undef POW3
1178#undef min
1179#undef max
1180
1181#endif
1182
1183#endif
1184
1185