matmul.m4 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. `/* Implementation of the MATMUL intrinsic
  2. Copyright (C) 2002-2022 Free Software Foundation, Inc.
  3. Contributed by Paul Brook <paul@nowt.org>
  4. This file is part of the GNU Fortran runtime library (libgfortran).
  5. Libgfortran is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU General Public
  7. License as published by the Free Software Foundation; either
  8. version 3 of the License, or (at your option) any later version.
  9. Libgfortran is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. Under Section 7 of GPL version 3, you are granted additional
  14. permissions described in the GCC Runtime Library Exception, version
  15. 3.1, as published by the Free Software Foundation.
  16. You should have received a copy of the GNU General Public License and
  17. a copy of the GCC Runtime Library Exception along with this program;
  18. see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. <http://www.gnu.org/licenses/>. */
  20. #include "libgfortran.h"
  21. #include <string.h>
  22. #include <assert.h>'
  23. include(iparm.m4)dnl
  24. `#if defined (HAVE_'rtype_name`)
  25. /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  26. passed to us by the front-end, in which case we call it for large
  27. matrices. */
  28. typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  29. const int *, const 'rtype_name` *, const 'rtype_name` *,
  30. const int *, const 'rtype_name` *, const int *,
  31. const 'rtype_name` *, 'rtype_name` *, const int *,
  32. int, int);
  33. /* The order of loops is different in the case of plain matrix
  34. multiplication C=MATMUL(A,B), and in the frequent special case where
  35. the argument A is the temporary result of a TRANSPOSE intrinsic:
  36. C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
  37. looking at their strides.
  38. The equivalent Fortran pseudo-code is:
  39. DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  40. IF (.NOT.IS_TRANSPOSED(A)) THEN
  41. C = 0
  42. DO J=1,N
  43. DO K=1,COUNT
  44. DO I=1,M
  45. C(I,J) = C(I,J)+A(I,K)*B(K,J)
  46. ELSE
  47. DO J=1,N
  48. DO I=1,M
  49. S = 0
  50. DO K=1,COUNT
  51. S = S+A(I,K)*B(K,J)
  52. C(I,J) = S
  53. ENDIF
  54. */
  55. /* If try_blas is set to a nonzero value, then the matmul function will
  56. see if there is a way to perform the matrix multiplication by a call
  57. to the BLAS gemm function. */
  58. extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
  59. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  60. int blas_limit, blas_call gemm);
  61. export_proto(matmul_'rtype_code`);
  62. /* Put exhaustive list of possible architectures here here, ORed together. */
  63. #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  64. #ifdef HAVE_AVX
  65. 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
  66. `static void
  67. 'matmul_name` ('rtype` * const restrict retarray,
  68. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  69. int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  70. static' include(matmul_internal.m4)dnl
  71. `#endif /* HAVE_AVX */
  72. #ifdef HAVE_AVX2
  73. 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
  74. `static void
  75. 'matmul_name` ('rtype` * const restrict retarray,
  76. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  77. int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
  78. static' include(matmul_internal.m4)dnl
  79. `#endif /* HAVE_AVX2 */
  80. #ifdef HAVE_AVX512F
  81. 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
  82. `static void
  83. 'matmul_name` ('rtype` * const restrict retarray,
  84. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  85. int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
  86. static' include(matmul_internal.m4)dnl
  87. `#endif /* HAVE_AVX512F */
  88. /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
  89. #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
  90. 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
  91. `void
  92. 'matmul_name` ('rtype` * const restrict retarray,
  93. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  94. int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
  95. internal_proto('matmul_name`);
  96. #endif
  97. #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
  98. 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
  99. `void
  100. 'matmul_name` ('rtype` * const restrict retarray,
  101. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  102. int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
  103. internal_proto('matmul_name`);
  104. #endif
  105. /* Function to fall back to if there is no special processor-specific version. */
  106. 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
  107. `static' include(matmul_internal.m4)dnl
  108. `/* Compiling main function, with selection code for the processor. */
  109. /* Currently, this is i386 only. Adjust for other architectures. */
  110. void matmul_'rtype_code` ('rtype` * const restrict retarray,
  111. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  112. int blas_limit, blas_call gemm)
  113. {
  114. static void (*matmul_p) ('rtype` * const restrict retarray,
  115. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  116. int blas_limit, blas_call gemm);
  117. void (*matmul_fn) ('rtype` * const restrict retarray,
  118. 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
  119. int blas_limit, blas_call gemm);
  120. matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  121. if (matmul_fn == NULL)
  122. {
  123. matmul_fn = matmul_'rtype_code`_vanilla;
  124. if (__builtin_cpu_is ("intel"))
  125. {
  126. /* Run down the available processors in order of preference. */
  127. #ifdef HAVE_AVX512F
  128. if (__builtin_cpu_supports ("avx512f"))
  129. {
  130. matmul_fn = matmul_'rtype_code`_avx512f;
  131. goto store;
  132. }
  133. #endif /* HAVE_AVX512F */
  134. #ifdef HAVE_AVX2
  135. if (__builtin_cpu_supports ("avx2")
  136. && __builtin_cpu_supports ("fma"))
  137. {
  138. matmul_fn = matmul_'rtype_code`_avx2;
  139. goto store;
  140. }
  141. #endif
  142. #ifdef HAVE_AVX
  143. if (__builtin_cpu_supports ("avx"))
  144. {
  145. matmul_fn = matmul_'rtype_code`_avx;
  146. goto store;
  147. }
  148. #endif /* HAVE_AVX */
  149. }
  150. else if (__builtin_cpu_is ("amd"))
  151. {
  152. #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
  153. if (__builtin_cpu_supports ("avx")
  154. && __builtin_cpu_supports ("fma"))
  155. {
  156. matmul_fn = matmul_'rtype_code`_avx128_fma3;
  157. goto store;
  158. }
  159. #endif
  160. #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
  161. if (__builtin_cpu_supports ("avx")
  162. && __builtin_cpu_supports ("fma4"))
  163. {
  164. matmul_fn = matmul_'rtype_code`_avx128_fma4;
  165. goto store;
  166. }
  167. #endif
  168. }
  169. store:
  170. __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
  171. }
  172. (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
  173. }
  174. #else /* Just the vanilla function. */
  175. 'define(`matmul_name',`matmul_'rtype_code)dnl
  176. define(`target_attribute',`')dnl
  177. include(matmul_internal.m4)dnl
  178. `#endif
  179. #endif
  180. '