iter_ull.c

/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */
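
/* A minimal usage sketch (hypothetical caller; `incr' and `process' are
   placeholders, the work share is assumed to have been set up elsewhere
   with a positive increment).  A thread would drain its share of the
   iteration space roughly like this:

     gomp_ull s, e;
     while (gomp_iter_ull_static_next (&s, &e) == 0)
       for (gomp_ull i = s; i < e; i += incr)
         process (i);
*/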

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
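
      /* Worked example (illustrative numbers): with n = 10 and
         nthreads = 4, q = 2 and t = 2, so threads 0 and 1 each take
         three zero-based iterations ([0,3) and [3,6)) while threads
         2 and 3 take two each ([6,8) and [8,10)).  */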

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
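
      /* Worked example (illustrative numbers): with c = 4, nthreads = 2
         and team_id i = 1, successive trips claim the zero-based ranges
         [4,8), [12,16), [20,24), ..., leaving [0,4), [8,12), ... for
         thread 0.  */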

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}

/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
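  /* Note that chunk is applied directly to the iteration variable below;
     for dynamic scheduling the stored chunk size is evidently already
     scaled by the loop increment when the work share is initialized, so
     no multiplication by ws->incr_ull is needed here.  */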
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }
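
  /* Fallback path: claim a chunk with a compare-and-swap loop.  The amount
     claimed is re-clamped against the iterations still remaining each time
     the CAS loses a race with another thread.  */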
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */

/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
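
  /* Worked example (illustrative numbers): with 100 iterations remaining,
     nthreads = 4 and chunk_size_ull = 5, q = 25 and this call hands out 25
     iterations; once only 12 remain, q = 3 is raised to the chunk size of
     5, and when fewer than q iterations are left the test below caps the
     range at ws->end_ull.  */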
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */