offload_omp_host.cpp 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267
  1. /*
  2. Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions
  5. are met:
  6. * Redistributions of source code must retain the above copyright
  7. notice, this list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright
  9. notice, this list of conditions and the following disclaimer in the
  10. documentation and/or other materials provided with the distribution.
  11. * Neither the name of Intel Corporation nor the names of its
  12. contributors may be used to endorse or promote products derived
  13. from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  15. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  16. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  17. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  18. HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  19. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  20. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <omp.h>
  27. //#include <stdlib.h>
  28. //#include "offload.h"
  29. #include "compiler_if_host.h"
  30. // OpenMP API
  31. void omp_set_default_device(int num) __GOMP_NOTHROW
  32. {
  33. if (num >= 0) {
  34. __omp_device_num = num;
  35. }
  36. }
  37. int omp_get_default_device(void) __GOMP_NOTHROW
  38. {
  39. return __omp_device_num;
  40. }
  41. int omp_get_num_devices() __GOMP_NOTHROW
  42. {
  43. __offload_init_library();
  44. return mic_engines_total;
  45. }
  46. // OpenMP 4.5 APIs
  47. // COI supports 3-dim multiD transfers
  48. #define MAX_ARRAY_RANK 3
  49. int omp_get_initial_device(
  50. void
  51. ) __GOMP_NOTHROW
  52. {
  53. return -1;
  54. }
  55. void* omp_target_alloc(
  56. size_t size,
  57. int device_num
  58. ) __GOMP_NOTHROW
  59. {
  60. __offload_init_library();
  61. OFFLOAD_TRACE(2, "omp_target_alloc(%lld, %d)\n", size, device_num);
  62. if (device_num < -1) {
  63. LIBOFFLOAD_ERROR(c_invalid_device_number);
  64. exit(1);
  65. }
  66. void* result = 0;
  67. // malloc on CPU
  68. if (device_num == -1) {
  69. // We do not check for malloc returning NULL because the
  70. // specification of this API includes the possibility of failure.
  71. // The user will check the returned result
  72. result = malloc(size);
  73. return result;
  74. }
  75. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(
  76. TARGET_MIC, device_num, 0, NULL, __func__, 0);
  77. if (ofld != 0) {
  78. VarDesc vars[2] = {0};
  79. vars[0].type.src = c_data;
  80. vars[0].type.dst = c_data;
  81. vars[0].direction.bits = c_parameter_in;
  82. vars[0].size = sizeof(size);
  83. vars[0].count = 1;
  84. vars[0].ptr = &size;
  85. vars[1].type.src = c_data;
  86. vars[1].type.dst = c_data;
  87. vars[1].direction.bits = c_parameter_out;
  88. vars[1].size = sizeof(result);
  89. vars[1].count = 1;
  90. vars[1].ptr = &result;
  91. OFFLOAD_OFFLOAD(ofld, "omp_target_alloc_target",
  92. 0, 2, vars, NULL, 0, 0, 0);
  93. }
  94. return result;
  95. }
  96. void omp_target_free(
  97. void *device_ptr,
  98. int device_num
  99. ) __GOMP_NOTHROW
  100. {
  101. __offload_init_library();
  102. OFFLOAD_TRACE(2, "omp_target_free(%p, %d)\n", device_ptr, device_num);
  103. if (device_num < -1) {
  104. LIBOFFLOAD_ERROR(c_invalid_device_number);
  105. exit(1);
  106. }
  107. // free on CPU
  108. if (device_num == -1) {
  109. free(device_ptr);
  110. return;
  111. }
  112. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(
  113. TARGET_MIC, device_num, 0, NULL, __func__, 0);
  114. if (ofld) {
  115. VarDesc vars[1] = {0};
  116. vars[0].type.src = c_data;
  117. vars[0].type.dst = c_data;
  118. vars[0].direction.bits = c_parameter_in;
  119. vars[0].size = sizeof(device_ptr);
  120. vars[0].count = 1;
  121. vars[0].ptr = &device_ptr;
  122. OFFLOAD_OFFLOAD(ofld, "omp_target_free_target",
  123. 0, 1, vars, NULL, 0, 0, 0);
  124. }
  125. }
  126. int omp_target_is_present(
  127. void *ptr,
  128. int device_num
  129. ) __GOMP_NOTHROW
  130. {
  131. __offload_init_library();
  132. OFFLOAD_TRACE(2, "omp_target_is_present(%p, %d)\n", ptr, device_num);
  133. if (device_num < -1) {
  134. LIBOFFLOAD_ERROR(c_invalid_device_number);
  135. exit(1);
  136. }
  137. if (device_num == -1) {
  138. return false;
  139. }
  140. // If OpenMP allows wrap-around for device numbers, enable next line
  141. //device_num %= mic_engines_total;
  142. // lookup existing association in pointer table
  143. PtrData* ptr_data = mic_engines[device_num].find_ptr_data(ptr);
  144. if (ptr_data == 0) {
  145. OFFLOAD_TRACE(3, "Address %p is not mapped on device %d\n",
  146. ptr, device_num);
  147. return false;
  148. }
  149. OFFLOAD_TRACE(3, "Address %p found mapped on device %d\n",
  150. ptr, device_num);
  151. return true;
  152. }
  153. int omp_target_memcpy(
  154. void *dst,
  155. void *src,
  156. size_t length,
  157. size_t dst_offset,
  158. size_t src_offset,
  159. int dst_device,
  160. int src_device
  161. ) __GOMP_NOTHROW
  162. {
  163. __offload_init_library();
  164. OFFLOAD_TRACE(2, "omp_target_memcpy(%p, %p, %lld, %lld, %lld, %d, %d)\n",
  165. dst, src, length, dst_offset, src_offset, dst_device, src_device);
  166. if (dst_device < -1 || src_device < -1) {
  167. LIBOFFLOAD_ERROR(c_invalid_device_number);
  168. exit(1);
  169. }
  170. char* srcp = (char *)src + src_offset;
  171. char* dstp = (char *)dst + dst_offset;
  172. if (src_device == -1) {
  173. // Source is CPU
  174. if (dst_device == -1) {
  175. // CPU -> CPU
  176. memcpy(dstp, srcp, length);
  177. return 0;
  178. } else {
  179. // CPU -> MIC
  180. // COIBufferWrite
  181. // If OpenMP allows wrap-around for device numbers, enable next line
  182. //dst_device %= mic_engines_total;
  183. OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", dstp);
  184. COIBUFFER mic_buf;
  185. COIRESULT res = COI::BufferCreateFromMemory(length,
  186. COI_BUFFER_NORMAL, COI_SINK_MEMORY, dstp,
  187. 1, &mic_engines[dst_device].get_process(),
  188. &mic_buf);
  189. if (res != COI_SUCCESS) {
  190. LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
  191. return 1;
  192. }
  193. res = COI::BufferWrite(mic_buf, 0, srcp, length,
  194. COI_COPY_UNSPECIFIED, 0, 0, 0);
  195. if (res != COI_SUCCESS) {
  196. LIBOFFLOAD_ERROR(c_buf_write, res);
  197. return 1;
  198. }
  199. res = COI::BufferDestroy(mic_buf);
  200. if (res != COI_SUCCESS) {
  201. LIBOFFLOAD_ERROR(c_buf_destroy, res);
  202. return 1;
  203. }
  204. return 0;
  205. }
  206. } else {
  207. // Source is device
  208. if (dst_device == -1) {
  209. // MIC -> CPU
  210. // COIBufferRead
  211. // If OpenMP allows wrap-around for device numbers, enable next line
  212. //src_device %= mic_engines_total;
  213. OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", srcp);
  214. COIBUFFER mic_buf;
  215. COIRESULT res = COI::BufferCreateFromMemory(length,
  216. COI_BUFFER_NORMAL, COI_SINK_MEMORY, srcp,
  217. 1, &mic_engines[src_device].get_process(),
  218. &mic_buf);
  219. if (res != COI_SUCCESS) {
  220. LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
  221. return 1;
  222. }
  223. res = COI::BufferRead(mic_buf, 0, dstp, length,
  224. COI_COPY_UNSPECIFIED, 0, 0, 0);
  225. if (res != COI_SUCCESS) {
  226. LIBOFFLOAD_ERROR(c_buf_read, res);
  227. return 1;
  228. }
  229. res = COI::BufferDestroy(mic_buf);
  230. if (res != COI_SUCCESS) {
  231. LIBOFFLOAD_ERROR(c_buf_destroy, res);
  232. return 1;
  233. }
  234. return 0;
  235. } else {
  236. // some MIC -> some MIC
  237. if (src_device == dst_device) {
  238. // MIC local copy will be done as remote memcpy
  239. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device,
  240. 0, NULL, __func__, 0);
  241. if (ofld) {
  242. VarDesc vars[3] = {0};
  243. vars[0].type.src = c_data;
  244. vars[0].type.dst = c_data;
  245. vars[0].direction.bits = c_parameter_in;
  246. vars[0].size = sizeof(dstp);
  247. vars[0].count = 1;
  248. vars[0].ptr = &dstp;
  249. vars[1].type.src = c_data;
  250. vars[1].type.dst = c_data;
  251. vars[1].direction.bits = c_parameter_in;
  252. vars[1].size = sizeof(srcp);
  253. vars[1].count = 1;
  254. vars[1].ptr = &srcp;
  255. vars[2].type.src = c_data;
  256. vars[2].type.dst = c_data;
  257. vars[2].direction.bits = c_parameter_in;
  258. vars[2].size = sizeof(length);
  259. vars[2].count = 1;
  260. vars[2].ptr = &length;
  261. OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_target",
  262. 0, 3, vars, NULL, 0, 0, 0);
  263. return 0;
  264. } else {
  265. return 1;
  266. }
  267. } else {
  268. // MICx -> MICy
  269. // Allocate CPU buffer
  270. char *cpu_mem = (char *)malloc(length);
  271. if (cpu_mem == 0) {
  272. LIBOFFLOAD_ERROR(c_malloc);
  273. return 1;
  274. }
  275. int retval = 1;
  276. if (omp_target_memcpy(
  277. cpu_mem, srcp, length, 0, 0, -1, src_device) == 0) {
  278. retval = omp_target_memcpy(
  279. dstp, cpu_mem, length, 0, 0, dst_device, -1);
  280. }
  281. free(cpu_mem);
  282. return retval;
  283. }
  284. }
  285. }
  286. }
  // Computes the byte size of one index step along the leading dimension:
  // element_size multiplied by dimensions[1] .. dimensions[num_dims-1].
  // dimensions[0] itself never enters the product.
  // With num_dims <= 1 the result is simply element_size.
  static size_t bytesize_at_this_dimension(
      size_t element_size,
      int num_dims,
      const size_t* dimensions
  )
  {
      size_t bytes = element_size;
      for (int d = 1; d < num_dims; ++d) {
          bytes *= dimensions[d];
      }
      return bytes;
  }
  301. static void memcpy_rect(
  302. char *dst,
  303. char *src,
  304. size_t element_size,
  305. int num_dims,
  306. const size_t *volume,
  307. const size_t *dst_offsets,
  308. const size_t *src_offsets,
  309. const size_t *dst_dimensions,
  310. const size_t *src_dimensions
  311. )
  312. {
  313. if (num_dims > 1) {
  314. int count = volume[0];
  315. int dst_index = dst_offsets[0];
  316. int src_index = src_offsets[0];
  317. size_t dst_element_size =
  318. bytesize_at_this_dimension(element_size, num_dims, dst_dimensions);
  319. size_t src_element_size =
  320. bytesize_at_this_dimension(element_size, num_dims, src_dimensions);
  321. for (; count>0; dst_index++, src_index++, count--) {
  322. memcpy_rect(dst+dst_element_size*dst_index,
  323. src+src_element_size*src_index,
  324. element_size, num_dims-1, volume+1,
  325. dst_offsets+1, src_offsets+1,
  326. dst_dimensions+1, src_dimensions+1);
  327. }
  328. } else {
  329. memcpy(dst+dst_offsets[0]*element_size,
  330. src+src_offsets[0]*element_size,
  331. element_size * volume[0]);
  332. }
  333. }
  334. int omp_target_memcpy_rect(
  335. void *dst_,
  336. void *src_,
  337. size_t element_size,
  338. int num_dims,
  339. const size_t *volume,
  340. const size_t *dst_offsets,
  341. const size_t *src_offsets,
  342. const size_t *dst_dimensions,
  343. const size_t *src_dimensions,
  344. int dst_device,
  345. int src_device
  346. ) __GOMP_NOTHROW
  347. {
  348. char *dst = (char *)dst_;
  349. char *src = (char *)src_;
  350. __offload_init_library();
  351. OFFLOAD_TRACE(2, "omp_target_memcpy_rect(%p, %p, %lld, %d, "
  352. "%p, %p, %p, %p, %p, %d, %d)\n",
  353. dst, src, element_size, num_dims,
  354. volume, dst_offsets, src_offsets,
  355. dst_dimensions, src_dimensions, dst_device, src_device);
  356. // MAX_ARRAY_RANK dimensions are supported
  357. if (dst == 0 && src == 0) {
  358. return MAX_ARRAY_RANK;
  359. }
  360. if (num_dims < 1 || num_dims > MAX_ARRAY_RANK ||
  361. element_size < 1 ||
  362. volume == 0 || dst_offsets == 0 || src_offsets == 0 ||
  363. dst_dimensions == 0 || src_dimensions == 0) {
  364. return 1;
  365. }
  366. if (dst_device < -1 || src_device < -1) {
  367. LIBOFFLOAD_ERROR(c_invalid_device_number);
  368. exit(1);
  369. }
  370. if (src_device == -1) {
  371. // Source is CPU
  372. if (dst_device == -1) {
  373. // CPU -> CPU
  374. memcpy_rect((char*)dst, (char*)src, element_size, num_dims, volume,
  375. dst_offsets, src_offsets,
  376. dst_dimensions, src_dimensions);
  377. return 0;
  378. } else {
  379. // CPU -> MIC
  380. // COIBufferWriteMultiD
  381. struct arr_desc dst_desc;
  382. struct arr_desc src_desc;
  383. dst_desc.base = (int64_t)dst;
  384. dst_desc.rank = num_dims;
  385. src_desc.base = (int64_t)src;
  386. src_desc.rank = num_dims;
  387. for (int i=0; i<num_dims; i++)
  388. {
  389. dst_desc.dim[i].size = bytesize_at_this_dimension(
  390. element_size,
  391. num_dims - i,
  392. dst_dimensions + i);
  393. dst_desc.dim[i].lindex = 0;
  394. dst_desc.dim[i].lower = dst_offsets[i];
  395. dst_desc.dim[i].upper = dst_offsets[i] + volume[i] - 1;
  396. dst_desc.dim[i].stride = 1;
  397. src_desc.dim[i].size = bytesize_at_this_dimension(
  398. element_size,
  399. num_dims - i,
  400. src_dimensions + i);
  401. src_desc.dim[i].lindex = 0;
  402. src_desc.dim[i].lower = src_offsets[i];
  403. src_desc.dim[i].upper = src_offsets[i] + volume[i] - 1;
  404. src_desc.dim[i].stride = 1;
  405. }
  406. __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false);
  407. __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false);
  408. // If OpenMP allows wrap-around for device numbers, enable next line
  409. //dst_device %= mic_engines_total;
  410. // Compute MIC buffer size
  411. size_t dst_length = dst_dimensions[0] * bytesize_at_this_dimension(
  412. element_size,
  413. num_dims,
  414. dst_dimensions);
  415. OFFLOAD_TRACE(3,
  416. "Creating buffer from sink memory %llx of size %lld\n",
  417. dst, dst_length);
  418. COIBUFFER mic_buf;
  419. COIRESULT res = COI::BufferCreateFromMemory(dst_length,
  420. COI_BUFFER_NORMAL, COI_SINK_MEMORY, dst,
  421. 1, &mic_engines[dst_device].get_process(),
  422. &mic_buf);
  423. if (res != COI_SUCCESS) {
  424. LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
  425. return 1;
  426. }
  427. res = COI::BufferWriteMultiD(mic_buf,
  428. mic_engines[dst_device].get_process(),
  429. 0, &dst_desc, &src_desc,
  430. COI_COPY_UNSPECIFIED, 0, 0, 0);
  431. if (res != COI_SUCCESS) {
  432. LIBOFFLOAD_ERROR(c_buf_write, res);
  433. return 1;
  434. }
  435. res = COI::BufferDestroy(mic_buf);
  436. if (res != COI_SUCCESS) {
  437. LIBOFFLOAD_ERROR(c_buf_destroy, res);
  438. return 1;
  439. }
  440. return 0;
  441. }
  442. } else {
  443. // Source is device
  444. if (dst_device == -1) {
  445. // COIBufferReadMultiD
  446. struct arr_desc dst_desc;
  447. struct arr_desc src_desc;
  448. dst_desc.base = (int64_t)dst;
  449. dst_desc.rank = num_dims;
  450. src_desc.base = (int64_t)src;
  451. src_desc.rank = num_dims;
  452. for (int i=0; i<num_dims; i++)
  453. {
  454. dst_desc.dim[i].size = bytesize_at_this_dimension(
  455. element_size,
  456. num_dims - i,
  457. dst_dimensions + i);
  458. dst_desc.dim[i].lindex = 0;
  459. dst_desc.dim[i].lower = dst_offsets[i];
  460. dst_desc.dim[i].upper = dst_offsets[i] + volume[i] - 1;
  461. dst_desc.dim[i].stride = 1;
  462. src_desc.dim[i].size = bytesize_at_this_dimension(
  463. element_size,
  464. num_dims - i,
  465. src_dimensions + i);
  466. src_desc.dim[i].lindex = 0;
  467. src_desc.dim[i].lower = src_offsets[i];
  468. src_desc.dim[i].upper = src_offsets[i] + volume[i] - 1;
  469. src_desc.dim[i].stride = 1;
  470. }
  471. __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false);
  472. __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false);
  473. // If OpenMP allows wrap-around for device numbers, enable next line
  474. //src_device %= mic_engines_total;
  475. // Compute MIC buffer size
  476. size_t src_length = src_dimensions[0] * bytesize_at_this_dimension(
  477. element_size,
  478. num_dims,
  479. src_dimensions);
  480. OFFLOAD_TRACE(3,
  481. "Creating buffer from sink memory %llx of size %lld\n",
  482. src, src_length);
  483. COIBUFFER mic_buf;
  484. COIRESULT res = COI::BufferCreateFromMemory(src_length,
  485. COI_BUFFER_NORMAL, COI_SINK_MEMORY, src,
  486. 1, &mic_engines[src_device].get_process(),
  487. &mic_buf);
  488. if (res != COI_SUCCESS) {
  489. LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
  490. return 1;
  491. }
  492. res = COI::BufferReadMultiD(mic_buf, 0,
  493. &dst_desc, &src_desc,
  494. COI_COPY_UNSPECIFIED, 0, 0, 0);
  495. if (res != COI_SUCCESS) {
  496. LIBOFFLOAD_ERROR(c_buf_write, res);
  497. return 1;
  498. }
  499. res = COI::BufferDestroy(mic_buf);
  500. if (res != COI_SUCCESS) {
  501. LIBOFFLOAD_ERROR(c_buf_destroy, res);
  502. return 1;
  503. }
  504. return 0;
  505. } else {
  506. // some MIC -> some MIC
  507. if (src_device == dst_device) {
  508. // MIC local copy will be done as remote memcpy_rect
  509. struct parameters {
  510. void *dst;
  511. void *src;
  512. size_t element_size;
  513. int num_dims;
  514. size_t array_info[MAX_ARRAY_RANK*5];
  515. } parameters = {dst, src, element_size, num_dims};
  516. int result;
  517. for (int i=0; i<num_dims; i++)
  518. {
  519. parameters.array_info[i] = volume[i];
  520. parameters.array_info[i+num_dims] = dst_offsets[i];
  521. parameters.array_info[i+num_dims*2] = src_offsets[i];
  522. parameters.array_info[i+num_dims*3] = dst_dimensions[i];
  523. parameters.array_info[i+num_dims*4] = src_dimensions[i];
  524. }
  525. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device,
  526. 0, NULL, __func__, 0);
  527. if (ofld) {
  528. VarDesc vars[1] = {0};
  529. vars[0].type.src = c_data;
  530. vars[0].type.dst = c_data;
  531. vars[0].direction.bits = c_parameter_in;
  532. vars[0].size = sizeof(parameters) -
  533. (MAX_ARRAY_RANK - num_dims) *
  534. 5 * sizeof(size_t);
  535. vars[0].count = 1;
  536. vars[0].ptr = &parameters;
  537. OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_rect_target",
  538. 0, 1, vars, NULL, 0, 0, 0);
  539. return 0;
  540. } else {
  541. return 1;
  542. }
  543. } else {
  544. // MICx -> MICy
  545. // Compute transfer byte-count
  546. size_t dst_length = element_size;
  547. for (int i=0; i<num_dims; i++) {
  548. dst_length *= volume[i];
  549. }
  550. // Allocate CPU buffer
  551. char *cpu_mem = (char *)malloc(dst_length);
  552. if (cpu_mem == 0) {
  553. LIBOFFLOAD_ERROR(c_malloc);
  554. return 1;
  555. }
  556. // Create CPU offset and dimension arrays
  557. // The CPU array collects the data in a contiguous block
  558. size_t cpu_offsets[MAX_ARRAY_RANK];
  559. size_t cpu_dimensions[MAX_ARRAY_RANK];
  560. for (int i=0; i<num_dims; i++) {
  561. cpu_offsets[i] = 0;
  562. cpu_dimensions[i] = volume[i];
  563. }
  564. int retval = 1;
  565. if (omp_target_memcpy_rect(
  566. cpu_mem, src, element_size, num_dims, volume,
  567. cpu_offsets, src_offsets,
  568. cpu_dimensions, src_dimensions,
  569. -1, src_device) == 0) {
  570. retval = omp_target_memcpy_rect(
  571. dst, cpu_mem, element_size, num_dims, volume,
  572. dst_offsets, cpu_offsets,
  573. dst_dimensions, cpu_dimensions,
  574. dst_device, -1);
  575. }
  576. free(cpu_mem);
  577. return retval;
  578. }
  579. }
  580. }
  581. }
  582. // host_ptr is key in table that yields association on device
  583. // A COIBUFFER of specified size is created from the memory at
  584. // device_ptr+device_offset on device_num
  585. int omp_target_associate_ptr(
  586. void *host_ptr,
  587. void *device_ptr,
  588. size_t size,
  589. size_t device_offset,
  590. int device_num
  591. ) __GOMP_NOTHROW
  592. {
  593. COIRESULT res;
  594. __offload_init_library();
  595. OFFLOAD_TRACE(2, "omp_target_associate_ptr(%p, %p, %lld, %lld, %d)\n",
  596. host_ptr, device_ptr, size, device_offset, device_num);
  597. if (device_num < -1) {
  598. LIBOFFLOAD_ERROR(c_invalid_device_number);
  599. exit(1);
  600. }
  601. // Associating to CPU is treated as failure
  602. if (device_num == -1) {
  603. return 1;
  604. }
  605. // An incorrect size is treated as failure
  606. if (size < 0) {
  607. return 1;
  608. }
  609. // If OpenMP allows wrap-around for device numbers, enable next line
  610. //Engine& device = mic_engines[device_num % mic_engines_total];
  611. Engine& device = mic_engines[device_num];
  612. // Does host pointer have association already?
  613. // lookup existing association in pointer table
  614. PtrData* ptr_data = device.find_ptr_data(host_ptr);
  615. if (ptr_data != 0) {
  616. OFFLOAD_TRACE(3, "Address %p is already mapped on device %d\n",
  617. host_ptr, device_num);
  618. // Is current device pointer and offset same as existing?
  619. if ((void*)ptr_data->mic_addr == device_ptr &&
  620. (size_t)ptr_data->alloc_disp == device_offset) {
  621. return 0;
  622. } else {
  623. return 1;
  624. }
  625. }
  626. // Create association
  627. OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
  628. host_ptr, size);
  629. bool is_new;
  630. ptr_data = device.insert_ptr_data(host_ptr, size, is_new);
  631. ptr_data->is_omp_associate = true;
  632. // create CPU buffer
  633. OFFLOAD_TRACE(3,
  634. "Creating buffer from source memory %p, length %lld\n",
  635. host_ptr, size);
  636. // result is not checked because we can continue without cpu
  637. // buffer. In this case we will use COIBufferRead/Write
  638. // instead of COIBufferCopy.
  639. COI::BufferCreateFromMemory(size,
  640. COI_BUFFER_OPENCL,
  641. 0,
  642. host_ptr,
  643. 1,
  644. &device.get_process(),
  645. &ptr_data->cpu_buf);
  646. // create MIC buffer
  647. OFFLOAD_TRACE(3,
  648. "Creating buffer from sink memory: addr %p, size %lld\n",
  649. (char *)device_ptr + device_offset, size);
  650. res = COI::BufferCreateFromMemory(size,
  651. COI_BUFFER_NORMAL,
  652. COI_SINK_MEMORY,
  653. device_ptr,
  654. 1,
  655. &device.get_process(),
  656. &ptr_data->mic_buf);
  657. if (res != COI_SUCCESS) {
  658. ptr_data->alloc_ptr_data_lock.unlock();
  659. return 1;
  660. }
  661. // make buffer valid on the device.
  662. res = COI::BufferSetState(ptr_data->mic_buf,
  663. device.get_process(),
  664. COI_BUFFER_VALID,
  665. COI_BUFFER_NO_MOVE,
  666. 0, 0, 0);
  667. if (res != COI_SUCCESS) {
  668. ptr_data->alloc_ptr_data_lock.unlock();
  669. return 1;
  670. }
  671. res = COI::BufferSetState(ptr_data->mic_buf,
  672. COI_PROCESS_SOURCE,
  673. COI_BUFFER_INVALID,
  674. COI_BUFFER_NO_MOVE,
  675. 0, 0, 0);
  676. if (res != COI_SUCCESS) {
  677. ptr_data->alloc_ptr_data_lock.unlock();
  678. return 1;
  679. }
  680. ptr_data->alloc_disp = device_offset;
  681. ptr_data->alloc_ptr_data_lock.unlock();
  682. return 0;
  683. }
  684. int omp_target_disassociate_ptr(
  685. void *host_ptr,
  686. int device_num
  687. ) __GOMP_NOTHROW
  688. {
  689. COIRESULT res;
  690. __offload_init_library();
  691. OFFLOAD_TRACE(2, "omp_target_disassociate_ptr(%p, %d)\n",
  692. host_ptr, device_num);
  693. if (device_num < -1) {
  694. LIBOFFLOAD_ERROR(c_invalid_device_number);
  695. exit(1);
  696. }
  697. // Dissociating from CPU is treated as failure
  698. if (device_num == -1) {
  699. return 1;
  700. }
  701. // If OpenMP allows wrap-around for device numbers, enable next line
  702. //Engine& device = mic_engines[device_num % mic_engines_total];
  703. Engine& device = mic_engines[device_num];
  704. // Lookup existing association in pointer table
  705. PtrData* ptr_data = device.find_ptr_data(host_ptr);
  706. // Attempt to disassociate unassociated pointer is a failure
  707. if (ptr_data == 0) {
  708. return 1;
  709. }
  710. // Destroy buffers
  711. if (ptr_data->cpu_buf != 0) {
  712. OFFLOAD_TRACE(3, "Destroying CPU buffer %p\n", ptr_data->cpu_buf);
  713. COI::BufferDestroy(ptr_data->cpu_buf);
  714. }
  715. if (ptr_data->mic_buf != 0) {
  716. OFFLOAD_TRACE(3, "Destroying MIC buffer %p\n", ptr_data->mic_buf);
  717. COI::BufferDestroy(ptr_data->mic_buf);
  718. }
  719. // Remove association from map
  720. OFFLOAD_TRACE(3, "Removing association for addr %p\n",
  721. ptr_data->cpu_addr.start());
  722. device.remove_ptr_data(ptr_data->cpu_addr.start());
  723. return 0;
  724. }
  725. // End of OpenMP 4.5 APIs
  726. // OpenMP API wrappers
  727. static void omp_set_int_target(
  728. TARGET_TYPE target_type,
  729. int target_number,
  730. int setting,
  731. const char* f_name
  732. )
  733. {
  734. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  735. f_name, 0);
  736. if (ofld) {
  737. VarDesc vars[1] = {0};
  738. vars[0].type.src = c_data;
  739. vars[0].type.dst = c_data;
  740. vars[0].direction.bits = c_parameter_in;
  741. vars[0].size = sizeof(int);
  742. vars[0].count = 1;
  743. vars[0].ptr = &setting;
  744. OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
  745. }
  746. }
  747. static int omp_get_int_target(
  748. TARGET_TYPE target_type,
  749. int target_number,
  750. const char * f_name
  751. )
  752. {
  753. int setting = 0;
  754. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  755. f_name, 0);
  756. if (ofld) {
  757. VarDesc vars[1] = {0};
  758. vars[0].type.src = c_data;
  759. vars[0].type.dst = c_data;
  760. vars[0].direction.bits = c_parameter_out;
  761. vars[0].size = sizeof(int);
  762. vars[0].count = 1;
  763. vars[0].ptr = &setting;
  764. OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
  765. }
  766. return setting;
  767. }
  768. void omp_set_num_threads_target(
  769. TARGET_TYPE target_type,
  770. int target_number,
  771. int num_threads
  772. )
  773. {
  774. omp_set_int_target(target_type, target_number, num_threads,
  775. "omp_set_num_threads_target");
  776. }
  777. int omp_get_max_threads_target(
  778. TARGET_TYPE target_type,
  779. int target_number
  780. )
  781. {
  782. return omp_get_int_target(target_type, target_number,
  783. "omp_get_max_threads_target");
  784. }
  785. int omp_get_num_procs_target(
  786. TARGET_TYPE target_type,
  787. int target_number
  788. )
  789. {
  790. return omp_get_int_target(target_type, target_number,
  791. "omp_get_num_procs_target");
  792. }
  793. void omp_set_dynamic_target(
  794. TARGET_TYPE target_type,
  795. int target_number,
  796. int num_threads
  797. )
  798. {
  799. omp_set_int_target(target_type, target_number, num_threads,
  800. "omp_set_dynamic_target");
  801. }
  802. int omp_get_dynamic_target(
  803. TARGET_TYPE target_type,
  804. int target_number
  805. )
  806. {
  807. return omp_get_int_target(target_type, target_number,
  808. "omp_get_dynamic_target");
  809. }
  810. void omp_set_nested_target(
  811. TARGET_TYPE target_type,
  812. int target_number,
  813. int nested
  814. )
  815. {
  816. omp_set_int_target(target_type, target_number, nested,
  817. "omp_set_nested_target");
  818. }
  819. int omp_get_nested_target(
  820. TARGET_TYPE target_type,
  821. int target_number
  822. )
  823. {
  824. return omp_get_int_target(target_type, target_number,
  825. "omp_get_nested_target");
  826. }
  827. void omp_set_schedule_target(
  828. TARGET_TYPE target_type,
  829. int target_number,
  830. omp_sched_t kind,
  831. int modifier
  832. )
  833. {
  834. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  835. __func__, 0);
  836. if (ofld != 0) {
  837. VarDesc vars[2] = {0};
  838. vars[0].type.src = c_data;
  839. vars[0].type.dst = c_data;
  840. vars[0].direction.bits = c_parameter_in;
  841. vars[0].size = sizeof(omp_sched_t);
  842. vars[0].count = 1;
  843. vars[0].ptr = &kind;
  844. vars[1].type.src = c_data;
  845. vars[1].type.dst = c_data;
  846. vars[1].direction.bits = c_parameter_in;
  847. vars[1].size = sizeof(int);
  848. vars[1].count = 1;
  849. vars[1].ptr = &modifier;
  850. OFFLOAD_OFFLOAD(ofld, "omp_set_schedule_target",
  851. 0, 2, vars, NULL, 0, 0, 0);
  852. }
  853. }
  854. void omp_get_schedule_target(
  855. TARGET_TYPE target_type,
  856. int target_number,
  857. omp_sched_t *kind,
  858. int *modifier
  859. )
  860. {
  861. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  862. __func__, 0);
  863. if (ofld != 0) {
  864. VarDesc vars[2] = {0};
  865. vars[0].type.src = c_data;
  866. vars[0].type.dst = c_data;
  867. vars[0].direction.bits = c_parameter_out;
  868. vars[0].size = sizeof(omp_sched_t);
  869. vars[0].count = 1;
  870. vars[0].ptr = kind;
  871. vars[1].type.src = c_data;
  872. vars[1].type.dst = c_data;
  873. vars[1].direction.bits = c_parameter_out;
  874. vars[1].size = sizeof(int);
  875. vars[1].count = 1;
  876. vars[1].ptr = modifier;
  877. OFFLOAD_OFFLOAD(ofld, "omp_get_schedule_target",
  878. 0, 2, vars, NULL, 0, 0, 0);
  879. }
  880. }
  881. // lock API functions
  882. void omp_init_lock_target(
  883. TARGET_TYPE target_type,
  884. int target_number,
  885. omp_lock_target_t *lock
  886. )
  887. {
  888. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  889. __func__, 0);
  890. if (ofld != 0) {
  891. VarDesc vars[1] = {0};
  892. vars[0].type.src = c_data;
  893. vars[0].type.dst = c_data;
  894. vars[0].direction.bits = c_parameter_out;
  895. vars[0].size = sizeof(omp_lock_target_t);
  896. vars[0].count = 1;
  897. vars[0].ptr = lock;
  898. OFFLOAD_OFFLOAD(ofld, "omp_init_lock_target",
  899. 0, 1, vars, NULL, 0, 0, 0);
  900. }
  901. }
  902. void omp_destroy_lock_target(
  903. TARGET_TYPE target_type,
  904. int target_number,
  905. omp_lock_target_t *lock
  906. )
  907. {
  908. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  909. __func__, 0);
  910. if (ofld != 0) {
  911. VarDesc vars[1] = {0};
  912. vars[0].type.src = c_data;
  913. vars[0].type.dst = c_data;
  914. vars[0].direction.bits = c_parameter_in;
  915. vars[0].size = sizeof(omp_lock_target_t);
  916. vars[0].count = 1;
  917. vars[0].ptr = lock;
  918. OFFLOAD_OFFLOAD(ofld, "omp_destroy_lock_target",
  919. 0, 1, vars, NULL, 0, 0, 0);
  920. }
  921. }
  922. void omp_set_lock_target(
  923. TARGET_TYPE target_type,
  924. int target_number,
  925. omp_lock_target_t *lock
  926. )
  927. {
  928. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  929. __func__, 0);
  930. if (ofld != 0) {
  931. VarDesc vars[1] = {0};
  932. vars[0].type.src = c_data;
  933. vars[0].type.dst = c_data;
  934. vars[0].direction.bits = c_parameter_inout;
  935. vars[0].size = sizeof(omp_lock_target_t);
  936. vars[0].count = 1;
  937. vars[0].ptr = lock;
  938. OFFLOAD_OFFLOAD(ofld, "omp_set_lock_target",
  939. 0, 1, vars, NULL, 0, 0, 0);
  940. }
  941. }
  942. void omp_unset_lock_target(
  943. TARGET_TYPE target_type,
  944. int target_number,
  945. omp_lock_target_t *lock
  946. )
  947. {
  948. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  949. __func__, 0);
  950. if (ofld != 0) {
  951. VarDesc vars[1] = {0};
  952. vars[0].type.src = c_data;
  953. vars[0].type.dst = c_data;
  954. vars[0].direction.bits = c_parameter_inout;
  955. vars[0].size = sizeof(omp_lock_target_t);
  956. vars[0].count = 1;
  957. vars[0].ptr = lock;
  958. OFFLOAD_OFFLOAD(ofld, "omp_unset_lock_target",
  959. 0, 1, vars, NULL, 0, 0, 0);
  960. }
  961. }
  962. int omp_test_lock_target(
  963. TARGET_TYPE target_type,
  964. int target_number,
  965. omp_lock_target_t *lock
  966. )
  967. {
  968. int result = 0;
  969. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  970. __func__, 0);
  971. if (ofld != 0) {
  972. VarDesc vars[2] = {0};
  973. vars[0].type.src = c_data;
  974. vars[0].type.dst = c_data;
  975. vars[0].direction.bits = c_parameter_inout;
  976. vars[0].size = sizeof(omp_lock_target_t);
  977. vars[0].count = 1;
  978. vars[0].ptr = lock;
  979. vars[1].type.src = c_data;
  980. vars[1].type.dst = c_data;
  981. vars[1].direction.bits = c_parameter_out;
  982. vars[1].size = sizeof(int);
  983. vars[1].count = 1;
  984. vars[1].ptr = &result;
  985. OFFLOAD_OFFLOAD(ofld, "omp_test_lock_target",
  986. 0, 2, vars, NULL, 0, 0, 0);
  987. }
  988. return result;
  989. }
  990. // nested lock API functions
  991. void omp_init_nest_lock_target(
  992. TARGET_TYPE target_type,
  993. int target_number,
  994. omp_nest_lock_target_t *lock
  995. )
  996. {
  997. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  998. __func__, 0);
  999. if (ofld != 0) {
  1000. VarDesc vars[1] = {0};
  1001. vars[0].type.src = c_data;
  1002. vars[0].type.dst = c_data;
  1003. vars[0].direction.bits = c_parameter_out;
  1004. vars[0].size = sizeof(omp_nest_lock_target_t);
  1005. vars[0].count = 1;
  1006. vars[0].ptr = lock;
  1007. OFFLOAD_OFFLOAD(ofld, "omp_init_nest_lock_target",
  1008. 0, 1, vars, NULL, 0, 0, 0);
  1009. }
  1010. }
  1011. void omp_destroy_nest_lock_target(
  1012. TARGET_TYPE target_type,
  1013. int target_number,
  1014. omp_nest_lock_target_t *lock
  1015. )
  1016. {
  1017. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  1018. __func__, 0);
  1019. if (ofld != 0) {
  1020. VarDesc vars[1] = {0};
  1021. vars[0].type.src = c_data;
  1022. vars[0].type.dst = c_data;
  1023. vars[0].direction.bits = c_parameter_in;
  1024. vars[0].size = sizeof(omp_nest_lock_target_t);
  1025. vars[0].count = 1;
  1026. vars[0].ptr = lock;
  1027. OFFLOAD_OFFLOAD(ofld, "omp_destroy_nest_lock_target",
  1028. 0, 1, vars, NULL, 0, 0, 0);
  1029. }
  1030. }
  1031. void omp_set_nest_lock_target(
  1032. TARGET_TYPE target_type,
  1033. int target_number,
  1034. omp_nest_lock_target_t *lock
  1035. )
  1036. {
  1037. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  1038. __func__, 0);
  1039. if (ofld != 0) {
  1040. VarDesc vars[1] = {0};
  1041. vars[0].type.src = c_data;
  1042. vars[0].type.dst = c_data;
  1043. vars[0].direction.bits = c_parameter_inout;
  1044. vars[0].size = sizeof(omp_nest_lock_target_t);
  1045. vars[0].count = 1;
  1046. vars[0].ptr = lock;
  1047. OFFLOAD_OFFLOAD(ofld, "omp_set_nest_lock_target",
  1048. 0, 1, vars, NULL, 0, 0, 0);
  1049. }
  1050. }
  1051. void omp_unset_nest_lock_target(
  1052. TARGET_TYPE target_type,
  1053. int target_number,
  1054. omp_nest_lock_target_t *lock
  1055. )
  1056. {
  1057. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  1058. __func__, 0);
  1059. if (ofld != 0) {
  1060. VarDesc vars[1] = {0};
  1061. vars[0].type.src = c_data;
  1062. vars[0].type.dst = c_data;
  1063. vars[0].direction.bits = c_parameter_inout;
  1064. vars[0].size = sizeof(omp_nest_lock_target_t);
  1065. vars[0].count = 1;
  1066. vars[0].ptr = lock;
  1067. OFFLOAD_OFFLOAD(ofld, "omp_unset_nest_lock_target",
  1068. 0, 1, vars, NULL, 0, 0, 0);
  1069. }
  1070. }
  1071. int omp_test_nest_lock_target(
  1072. TARGET_TYPE target_type,
  1073. int target_number,
  1074. omp_nest_lock_target_t *lock
  1075. )
  1076. {
  1077. int result = 0;
  1078. OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
  1079. __func__, 0);
  1080. if (ofld != 0) {
  1081. VarDesc vars[2] = {0};
  1082. vars[0].type.src = c_data;
  1083. vars[0].type.dst = c_data;
  1084. vars[0].direction.bits = c_parameter_inout;
  1085. vars[0].size = sizeof(omp_nest_lock_target_t);
  1086. vars[0].count = 1;
  1087. vars[0].ptr = lock;
  1088. vars[1].type.src = c_data;
  1089. vars[1].type.dst = c_data;
  1090. vars[1].direction.bits = c_parameter_out;
  1091. vars[1].size = sizeof(int);
  1092. vars[1].count = 1;
  1093. vars[1].ptr = &result;
  1094. OFFLOAD_OFFLOAD(ofld, "omp_test_nest_lock_target",
  1095. 0, 2, vars, NULL, 0, 0, 0);
  1096. }
  1097. return result;
  1098. }