// offload_host.cpp
  1. /*
  2. Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
  3. Redistribution and use in source and binary forms, with or without
  4. modification, are permitted provided that the following conditions
  5. are met:
  6. * Redistributions of source code must retain the above copyright
  7. notice, this list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright
  9. notice, this list of conditions and the following disclaimer in the
  10. documentation and/or other materials provided with the distribution.
  11. * Neither the name of Intel Corporation nor the names of its
  12. contributors may be used to endorse or promote products derived
  13. from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  15. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  16. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  17. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  18. HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  19. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  20. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
// Forward declarations: the following two functions are declared as
// friends in offload_engine.h, and clang rejects the "static" specifier
// when it appears after the friend declaration, so declare them (with
// static) before any header that befriends them is included.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
  31. #include "offload_host.h"
  32. #ifdef MYO_SUPPORT
  33. #include "offload_myo_host.h"
  34. #endif
  35. #include <malloc.h>
  36. #ifndef TARGET_WINNT
  37. #include <alloca.h>
  38. #include <elf.h>
  39. #endif // TARGET_WINNT
  40. #include <errno.h>
  41. #include <fcntl.h>
  42. #include <stdlib.h>
  43. #include <string.h>
  44. #include <sys/stat.h>
  45. #include <sys/types.h>
  46. #include <algorithm>
  47. #include <bitset>
  48. #include <iostream>
  49. #if defined(HOST_WINNT)
  50. #define PATH_SEPARATOR ";"
  51. #else
  52. #define PATH_SEPARATOR ":"
  53. #endif
  54. #define GET_OFFLOAD_NUMBER(timer_data) \
  55. timer_data? timer_data->offload_number : 0
  56. static void (*task_completion_callback)(void *);
  57. extern "C" {
  58. #ifdef TARGET_WINNT
  59. // Windows does not support imports from libraries without actually
  60. // including them as dependence. We don't want to include in the
  61. // dependence since is it used only for Fortran when traceback is enabled.
  62. // Chose to implement it with GetProcAddress.
  63. #define FORTRAN_TRACE_BACK win_for__continue_traceback
  64. int win_for__continue_traceback( _Offload_result coi_offload_result )
  65. {
  66. HINSTANCE hDLL;
  67. int (* TraceBackRoutine)(_Offload_result value);
  68. hDLL = LoadLibrary("libifcoremd.dll");
  69. if (hDLL != 0) {
  70. TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
  71. "for__continue_traceback");
  72. if (TraceBackRoutine != 0) {
  73. return TraceBackRoutine(coi_offload_result);
  74. }
  75. else {
  76. OFFLOAD_TRACE(3,
  77. "Cannot find for__continue_traceback routine in libifcorert.dll\n");
  78. exit(1);
  79. }
  80. }
  81. else {
  82. OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
  83. exit(1);
  84. }
  85. return 0;
  86. }
  87. #else // TARGET_WINNT
  88. #define FORTRAN_TRACE_BACK for__continue_traceback
  89. // for__continue_traceback is provided as a dummy to resolve link time symbols
  90. // for C/C++ programs. For Fortran the actual fortran library function in
  91. // libifcore.so is used.
  92. #pragma weak for__continue_traceback
  93. int for__continue_traceback( _Offload_result coi_offload_result )
  94. {
  95. OFFLOAD_TRACE(3,
  96. "liboffload function for_continue_traceback should not be called.\n");
  97. exit(1);
  98. }
  99. #endif //TARGET_WINNT
  100. } // extern "C"
#ifdef TARGET_WINNT
// Small subset of ELF declarations for Windows which is needed to compile
// this file. ELF header is used to understand what binary type is contained
// in the target image - shared library or executable.
typedef uint16_t Elf64_Half;
typedef uint32_t Elf64_Word;
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;

// Size in bytes of the ELF identification field e_ident.
#define EI_NIDENT 16
// e_type values of interest: executable file and shared object file.
#define ET_EXEC 2
#define ET_DYN 3

// 64-bit ELF file header; layout mirrors Elf64_Ehdr from <elf.h>.
typedef struct
{
    unsigned char e_ident[EI_NIDENT]; // magic number and other info
    Elf64_Half e_type;                // object file type (ET_EXEC/ET_DYN)
    Elf64_Half e_machine;             // target architecture
    Elf64_Word e_version;             // object file version
    Elf64_Addr e_entry;               // entry point virtual address
    Elf64_Off e_phoff;                // program header table file offset
    Elf64_Off e_shoff;                // section header table file offset
    Elf64_Word e_flags;               // processor-specific flags
    Elf64_Half e_ehsize;              // ELF header size in bytes
    Elf64_Half e_phentsize;           // program header table entry size
    Elf64_Half e_phnum;               // program header table entry count
    Elf64_Half e_shentsize;           // section header table entry size
    Elf64_Half e_shnum;               // section header table entry count
    Elf64_Half e_shstrndx;            // section header string table index
} Elf64_Ehdr;
#endif // TARGET_WINNT
// Host console and file logging
const char *prefix;
int console_enabled = 0;
int offload_number = 0;

// Names of the environment variables controlling host tracing,
// offload reporting and host timing.
static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
static const char *timer_envname = "H_TIME";

// DMA channel count used by COI and set via
// OFFLOAD_DMA_CHANNEL_COUNT environment variable
uint32_t mic_dma_channel_count;

// Trace information
// Readable names indexed by variable-descriptor direction value.
static const char* vardesc_direction_as_string[] = {
    "NOCOPY",
    "IN",
    "OUT",
    "INOUT"
};
// Readable names indexed by variable-descriptor type value.
static const char* vardesc_type_as_string[] = {
    "unknown",
    "data",
    "data_ptr",
    "func_ptr",
    "void_ptr",
    "string_ptr",
    "dv",
    "dv_data",
    "dv_data_slice",
    "dv_ptr",
    "dv_ptr_data",
    "dv_ptr_data_slice",
    "cean_var",
    "cean_var_ptr",
    "c_data_ptr_array",
    "c_extended_type",
    "c_func_ptr_array",
    "c_void_ptr_array",
    "c_string_ptr_array",
    "c_data_ptr_ptr",
    "c_func_ptr_ptr",
    "c_void_ptr_ptr",
    "c_string_ptr_ptr",
    "c_cean_var_ptr_ptr",
};

// Engine objects (one per device) and their count.
Engine* mic_engines = 0;
uint32_t mic_engines_total = 0;
// Per-thread key used by the runtime; environment variables forwarded
// to the devices.
pthread_key_t mic_thread_key;
MicEnvVar mic_env_vars;
uint64_t cpu_frequency = 0;

// MIC_STACKSIZE
uint32_t mic_stack_size = 12 * 1024 * 1024;

// MIC_BUFFERSIZE
uint64_t mic_buffer_size = 0;

// Preallocated 4K page memory size for buffers on MIC
uint64_t mic_4k_buffer_size = 0;

// Preallocated 2M page memory size for buffers on MIC
uint64_t mic_2m_buffer_size = 0;

// LD_LIBRARY_PATH for KNC
char* knc_library_path = 0;

// LD_LIBRARY_PATH for KNL
char* knl_library_path = 0;

// MIC_PROXY_IO
bool mic_proxy_io = true;

// MIC_PROXY_FS_ROOT
char* mic_proxy_fs_root = 0;

// Threshold for creating buffers with large pages. Buffer is created
// with large pages hint if its size exceeds the threshold value.
// By default large pages are disabled right now (by setting default
// value for threshold to MAX) due to HSD 4114629.
uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
static const char *mic_use_2mb_buffers_envname =
    "MIC_USE_2MB_BUFFERS";

// Thresholds (bytes) above which buffer writes/reads are done
// asynchronously; overridable via the environment variables below.
static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_write_envname =
    "MIC_USE_ASYNC_BUFFER_WRITE";

static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_read_envname =
    "MIC_USE_ASYNC_BUFFER_READ";

// device initialization type
OffloadInitType __offload_init_type = c_init_on_offload_all;
static const char *offload_init_envname = "OFFLOAD_INIT";

// active wait
static bool __offload_active_wait = true;
static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";

// wait even for asynchronous offload
// true for now still the performance issue with COI is not fixed
static bool __offload_always_wait = true;
static const char *offload_always_wait_envname = "OFFLOAD_ALWAYS_WAIT";

// OMP_DEFAULT_DEVICE
int __omp_device_num = 0;
static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";

//OFFLOAD_PARALLEL_COPY
static bool __offload_parallel_copy = false;
static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";

//Use COI interface for noncontiguous transfer if it exists.
static bool __offload_use_coi_noncontiguous_transfer = false;
static const char *use_coi_noncontiguous_transfer_envname =
    "MIC_USE_COI_MULTI_D";

// The list of pending target libraries
static bool __target_libs;
static TargetImageList __target_libs_list;
static mutex_t __target_libs_lock;
static mutex_t stack_alloc_lock;
static mutex_t lock_complete;

// Set of OffloadDescriptors of asynchronous offloads that are not destroyed
std::map<void *, bool> offload_descr_map;

// Target executable
TargetImage* __target_exe;
// is true if last loaded image is dll
bool __current_image_is_dll = false;
// is true if myo library is loaded when dll is loaded
bool __myo_init_in_so = false;
  241. // Print readable offload flags
  242. static void trace_offload_flags(
  243. OffloadHostTimerData* timer_data,
  244. OffloadFlags offload_flags
  245. )
  246. {
  247. // Sized big enough for all flag names
  248. char fbuffer[256];
  249. bool first = true;
  250. if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
  251. sprintf(fbuffer, " OffloadFlags=(");
  252. if (offload_flags.bits.fortran_traceback) {
  253. sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
  254. first = false;
  255. }
  256. if (offload_flags.bits.omp_async) {
  257. sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async");
  258. first = false;
  259. }
  260. OFFLOAD_DEBUG_TRACE_1(1,
  261. GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
  262. "%s)\n", fbuffer);
  263. }
  264. }
  265. // Print readable varDesc flags
  266. static void trace_varDesc_flags(
  267. OffloadHostTimerData* timer_data,
  268. varDescFlags offload_flags
  269. )
  270. {
  271. // Sized big enough for all flag names
  272. char fbuffer[256];
  273. bool first = true;
  274. if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
  275. sprintf(fbuffer, " varDescFlags=(");
  276. if (offload_flags.is_static) {
  277. sprintf(fbuffer+strlen(fbuffer), "is_static");
  278. first = false;
  279. }
  280. if (offload_flags.is_static_dstn) {
  281. sprintf(fbuffer+strlen(fbuffer),
  282. first ? "is_static_dstn" : ",is_static_dstn");
  283. first = false;
  284. }
  285. if (offload_flags.has_length) {
  286. sprintf(fbuffer+strlen(fbuffer),
  287. first ? "has_length" : ",has_length");
  288. first = false;
  289. }
  290. if (offload_flags.is_stack_buf) {
  291. sprintf(fbuffer+strlen(fbuffer),
  292. first ? "is_stack_buf" : ",is_stack_buf");
  293. first = false;
  294. }
  295. if (offload_flags.targetptr) {
  296. sprintf(fbuffer+strlen(fbuffer),
  297. first ? "targetptr" : ",targetptr");
  298. first = false;
  299. }
  300. if (offload_flags.preallocated) {
  301. sprintf(fbuffer+strlen(fbuffer),
  302. first ? "preallocated" : ",preallocated");
  303. first = false;
  304. }
  305. if (offload_flags.is_pointer) {
  306. sprintf(fbuffer+strlen(fbuffer),
  307. first ? "is_pointer" : ",is_pointer");
  308. first = false;
  309. }
  310. if (offload_flags.sink_addr) {
  311. sprintf(fbuffer+strlen(fbuffer),
  312. first ? "sink_addr" : ",sink_addr");
  313. first = false;
  314. }
  315. if (offload_flags.alloc_disp) {
  316. sprintf(fbuffer+strlen(fbuffer),
  317. first ? "alloc_disp" : ",alloc_disp");
  318. first = false;
  319. }
  320. if (offload_flags.is_noncont_src) {
  321. sprintf(fbuffer+strlen(fbuffer),
  322. first ? "is_noncont_src" : ",is_noncont_src");
  323. first = false;
  324. }
  325. if (offload_flags.is_noncont_dst) {
  326. sprintf(fbuffer+strlen(fbuffer),
  327. first ? "is_noncont_dst" : ",is_noncont_dst");
  328. first = false;
  329. }
  330. if (offload_flags.always_copy) {
  331. sprintf(fbuffer+strlen(fbuffer),
  332. first ? "always_copy" : ",always_copy");
  333. first = false;
  334. }
  335. if (offload_flags.always_delete) {
  336. sprintf(fbuffer+strlen(fbuffer),
  337. first ? "always_delete" : ",always_delete");
  338. first = false;
  339. }
  340. if (offload_flags.is_non_cont_struct) {
  341. sprintf(fbuffer+strlen(fbuffer),
  342. first ? "is_non_cont_struct" : ",is_non_cont_struct");
  343. first = false;
  344. }
  345. if (offload_flags.pin) {
  346. sprintf(fbuffer+strlen(fbuffer),
  347. first ? "pin" : ",pin");
  348. first = false;
  349. }
  350. if (offload_flags.is_device_ptr) {
  351. sprintf(fbuffer+strlen(fbuffer),
  352. first ? "is_device_ptr" : ",is_device_ptr");
  353. first = false;
  354. }
  355. if (offload_flags.use_device_ptr) {
  356. sprintf(fbuffer+strlen(fbuffer),
  357. first ? "use_device_ptr" : ",use_device_ptr");
  358. }
  359. OFFLOAD_DEBUG_TRACE_1(1,
  360. GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
  361. "%s)\n", fbuffer);
  362. }
  363. }
  364. static char * offload_get_src_base(void * ptr, uint8_t type)
  365. {
  366. char *base;
  367. if (VAR_TYPE_IS_PTR(type)) {
  368. base = *static_cast<char**>(ptr);
  369. }
  370. else if (VAR_TYPE_IS_SCALAR(type)) {
  371. base = static_cast<char*>(ptr);
  372. }
  373. else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
  374. ArrDesc *dvp;
  375. if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
  376. const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
  377. dvp = (type == c_dv_data_slice) ?
  378. reinterpret_cast<ArrDesc*>(ap->base) :
  379. *reinterpret_cast<ArrDesc**>(ap->base);
  380. }
  381. else {
  382. dvp = (type == c_dv_data) ?
  383. static_cast<ArrDesc*>(ptr) :
  384. *static_cast<ArrDesc**>(ptr);
  385. }
  386. base = reinterpret_cast<char*>(dvp->Base);
  387. }
  388. else {
  389. base = NULL;
  390. }
  391. return base;
  392. }
// Report a fatal COI failure to the user and terminate the process.
// 'msg' selects which diagnostic is printed; 'res' is the COI status that
// triggered it.  This function never returns.
void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
{
    // special case for the 'process died' error
    if (res == COI_PROCESS_DIED) {
        m_device.fini_process(true);
    }
    else {
        switch (msg) {
            case c_buf_create:
                // Out-of-memory buffer creation gets a dedicated message.
                if (res == COI_OUT_OF_MEMORY) {
                    msg = c_buf_create_out_of_mem;
                }
                /* fallthru */
            case c_buf_create_from_mem:
            case c_buf_get_address:
            case c_pipeline_create:
            case c_pipeline_run_func:
                // These diagnostics also include the logical device index.
                LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
                break;
            case c_buf_read:
            case c_buf_write:
            case c_buf_copy:
            case c_buf_map:
            case c_buf_unmap:
            case c_buf_destroy:
            case c_buf_set_state:
                LIBOFFLOAD_ERROR(msg, res);
                break;
            default:
                break;
        }
    }
    exit(1);
}
  427. _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
  428. {
  429. switch (res) {
  430. case COI_SUCCESS:
  431. return OFFLOAD_SUCCESS;
  432. case COI_PROCESS_DIED:
  433. return OFFLOAD_PROCESS_DIED;
  434. case COI_OUT_OF_MEMORY:
  435. return OFFLOAD_OUT_OF_MEMORY;
  436. default:
  437. return OFFLOAD_ERROR;
  438. }
  439. }
// is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
// is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
// allocate memory at target; use its value as base in target table.
// is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
// base - is address at target of preallocated memory; use its value as
// base in target table.
//
// Finds or creates the pointer-table association covering
// [base+disp, base+disp+size) and the host/MIC buffers backing it.
// On success ptr_data points at the (new or existing) entry and true is
// returned; false is returned when a COI call fails and the offload is
// optional (m_status->result recorded) or not mandatory.  Overlap with an
// incompatible existing association is a fatal error (exit(1)).
// NOTE(review): on the new-entry path the entry's alloc_ptr_data_lock is
// unlocked here without a visible lock() - presumably insert_*_data returns
// the entry locked; confirm against the Engine implementation.
bool OffloadDescriptor::alloc_ptr_data(
    PtrData* &ptr_data,
    void *base,
    int64_t disp,
    int64_t size,
    int64_t alloc_disp,
    int align,
    bool is_targptr,
    bool is_prealloc,
    bool pin
)
{
    // total length of base
    int64_t length = size;
    bool is_new;
    COIBUFFER targptr_buf;
    COIRESULT res;
    uint32_t buffer_flags = 0;
    char * base_disp = reinterpret_cast<char *>(base) + disp;

    // create buffer with large pages if data length exceeds
    // large page threshold
    if (length >= __offload_use_2mb_buffers) {
        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
    }
    // Allocate memory at target for targetptr without preallocated as we need
    // its address as base argument in call to m_device.insert_ptr_data
    if (is_targptr && !is_prealloc) {
        length = alloc_disp ? length : size + disp;
        res = COI::BufferCreate(
            length,
            COI_BUFFER_OPENCL,
            buffer_flags,
            0,
            1,
            &m_device.get_process(),
            &targptr_buf);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            else if (m_is_mandatory) {
                report_coi_error(c_buf_create, res);
            }
            return false;
        }
        // 'base' is replaced with the sink-side address of the new buffer.
        res = COI::BufferGetSinkAddress(
            targptr_buf, reinterpret_cast<uint64_t *>(&base));
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            else if (m_is_mandatory) {
                report_coi_error(c_buf_get_address, res);
            }
            return false;
        }
    }
    OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
                  alloc_disp ? base : base_disp,
                  alloc_disp ? length : size + disp);

    // add new entry
    ptr_data = is_targptr ?
        m_device.find_targetptr_data(base_disp) :
        m_device.find_ptr_data(base_disp);
    // if ptr_data is found just need to check it for overlapping
    if (ptr_data) {
        is_new = false;
        base = base_disp;
    }
    else {
        // If association is not found we must create it.
        length = alloc_disp ? length : size + disp;
        ptr_data = is_targptr ?
            m_device.insert_targetptr_data(base, length, is_new) :
            m_device.insert_ptr_data(base, length, is_new);
    }
    if (is_new) {

        OFFLOAD_TRACE(3, "Added new association\n");

        if (length > 0) {
            OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);

            // align should be a power of 2
            if (!pin && !is_targptr &&
                align > 0 && (align & (align - 1)) == 0) {
                // offset within mic_buffer. Can do offset optimization
                // only when source address alignment satisfies requested
                // alignment on the target (cq172736).
                if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
                    ptr_data->mic_offset =
                        reinterpret_cast<intptr_t>(base) & 4095;
                }
            }

            // buffer size and flags
            uint64_t buffer_size = length + ptr_data->mic_offset;

            // For targetptr there is no CPU buffer
            if (pin || !is_targptr) {
                // create CPU buffer
                OFFLOAD_DEBUG_TRACE_1(3,
                    GET_OFFLOAD_NUMBER(get_timer_data()),
                    c_offload_create_buf_host,
                    "Creating buffer from source memory %p, "
                    "length %lld\n", base, length);

                // result is not checked because we can continue without cpu
                // buffer. In this case we will use COIBufferRead/Write
                // instead of COIBufferCopy.
                COI::BufferCreateFromMemory(length,
                                            COI_BUFFER_OPENCL,
                                            0,
                                            base,
                                            1,
                                            &m_device.get_process(),
                                            &ptr_data->cpu_buf);
            }

            // create MIC buffer
            if (is_prealloc) {
                // Wrap the preallocated sink memory at 'base'.
                OFFLOAD_DEBUG_TRACE_1(3,
                    GET_OFFLOAD_NUMBER(get_timer_data()),
                    c_offload_create_buf_mic,
                    "Creating buffer from sink memory: "
                    "addr %p, size %lld, offset %d, flags 0x%x\n",
                    base, buffer_size, ptr_data->mic_offset,
                    buffer_flags);
                res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
                                                  COI_BUFFER_NORMAL,
                                                  COI_SINK_MEMORY,
                                                  base,
                                                  1,
                                                  &m_device.get_process(),
                                                  &ptr_data->mic_buf);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_create, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }
            else if (is_targptr) {
                // Reuse the sink buffer created at the top of this function.
                ptr_data->mic_buf = targptr_buf;
            }
            else if (!pin) {
                OFFLOAD_DEBUG_TRACE_1(3,
                    GET_OFFLOAD_NUMBER(get_timer_data()),
                    c_offload_create_buf_mic,
                    "Creating buffer for sink: size %lld, offset %d, "
                    "flags =0x%x\n", buffer_size,
                    ptr_data->mic_offset, buffer_flags);
                res = COI::BufferCreate(buffer_size,
                                        COI_BUFFER_NORMAL,
                                        buffer_flags,
                                        0,
                                        1,
                                        &m_device.get_process(),
                                        &ptr_data->mic_buf);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_create, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }

            if (!pin) {
                // make buffer valid on the device.
                res = COI::BufferSetState(ptr_data->mic_buf,
                                          m_device.get_process(),
                                          COI_BUFFER_VALID,
                                          COI_BUFFER_NO_MOVE,
                                          0, 0, 0);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_set_state, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }

                // and mark it invalid in the source (host) process.
                res = COI::BufferSetState(ptr_data->mic_buf,
                                          COI_PROCESS_SOURCE,
                                          COI_BUFFER_INVALID,
                                          COI_BUFFER_NO_MOVE,
                                          0, 0, 0);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_set_state, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }
        }

        ptr_data->alloc_disp = alloc_disp;
        ptr_data->alloc_ptr_data_lock.unlock();
    }
    else {
        mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);

        OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
                      "is_static %d\n",
                      ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                      ptr_data->is_static);

        // This is not a new entry. Make sure that provided address range fits
        // into existing one.
        MemRange addr_range(base, length);
        if (!ptr_data->cpu_addr.contains(addr_range)) {
            LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
                             const_cast<void *>(ptr_data->cpu_addr.start()),
                             ptr_data->cpu_addr.length());
            exit(1);
        }

        // if the entry is associated with static data it may not have buffers
        // created because they are created on demand.
        if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
            return false;
        }
    }
    return true;
}
  673. bool OffloadDescriptor::find_ptr_data(
  674. PtrData* &ptr_data,
  675. void *in_base,
  676. int64_t disp,
  677. int64_t size,
  678. bool is_targetptr,
  679. bool report_error
  680. )
  681. {
  682. // total length of base
  683. int64_t length = size;
  684. char *base = reinterpret_cast<char *>(in_base) + disp;
  685. OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
  686. "length %lld\n", base, length);
  687. // find existing association in pointer table
  688. ptr_data = is_targetptr ?
  689. m_device.find_targetptr_data(base) :
  690. m_device.find_ptr_data(base);
  691. if (ptr_data == 0) {
  692. if (report_error) {
  693. LIBOFFLOAD_ERROR(c_no_ptr_data, base);
  694. exit(1);
  695. }
  696. OFFLOAD_TRACE(3, "Association does not exist\n");
  697. return true;
  698. }
  699. OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
  700. ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
  701. ptr_data->is_static);
  702. // make sure that provided address range fits into existing one
  703. MemRange addr_range(base, length);
  704. if (!ptr_data->cpu_addr.contains(addr_range)) {
  705. if (report_error) {
  706. LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
  707. const_cast<void *>(ptr_data->cpu_addr.start()),
  708. ptr_data->cpu_addr.length());
  709. exit(1);
  710. }
  711. OFFLOAD_TRACE(3, "Existing association partially overlaps with "
  712. "data address range\n");
  713. ptr_data = 0;
  714. return true;
  715. }
  716. // if the entry is associated with static data it may not have buffers
  717. // created because they are created on demand.
  718. if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
  719. return false;
  720. }
  721. return true;
  722. }
  723. void OffloadDescriptor::find_device_ptr(
  724. int64_t* &device_ptr,
  725. void *host_ptr
  726. )
  727. {
  728. PtrData* ptr_data;
  729. char *base = reinterpret_cast<char *>(host_ptr);
  730. OFFLOAD_TRACE(3, "Looking for association for data: addr %p\n", base);
  731. // find existing association in pointer table
  732. ptr_data = m_device.find_ptr_data(base);
  733. // MIC address should have been assigned.
  734. // For now assume does not exist and get the addr
  735. // if ((ptr_data == 0) || ptr_data->mic_addr) {
  736. if (ptr_data == 0) {
  737. OFFLOAD_TRACE(3, "Association does not exist\n");
  738. LIBOFFLOAD_ERROR(c_no_ptr_data, base);
  739. exit(1);
  740. }
  741. if (!ptr_data->mic_addr) {
  742. COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
  743. &ptr_data->mic_addr);
  744. if (res != COI_SUCCESS) {
  745. if (m_status != 0)
  746. m_status->result = translate_coi_error(res);
  747. report_coi_error(c_buf_get_address, res);
  748. }
  749. }
  750. device_ptr = (int64_t *) ptr_data->mic_addr;
  751. OFFLOAD_TRACE(3, "Found association: host_ptr %p, device_ptr = %p\n",
  752. ptr_data->cpu_addr.start(), device_ptr);
  753. }
// Create the host and MIC buffers for a pointer-table entry that refers to
// static data.  Buffers for static variables are created on demand rather
// than at association time, so each one is created here only if missing.
// Returns false when creation fails and the offload is optional (m_status
// recorded); otherwise a failure is fatal via report_coi_error.
bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);

    if (ptr_data->cpu_buf == 0) {
        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
                      ptr_data->cpu_addr.start());
        COIRESULT res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_OPENCL,
            0,
            const_cast<void*>(ptr_data->cpu_addr.start()),
            1, &m_device.get_process(),
            &ptr_data->cpu_buf);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_create_from_mem, res);
        }
    }
    if (ptr_data->mic_buf == 0) {
        // Wrap the already-known sink-side address of the static variable.
        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
                      ptr_data->mic_addr);
        COIRESULT res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_NORMAL,
            COI_SINK_MEMORY,
            reinterpret_cast<void*>(ptr_data->mic_addr),
            1, &m_device.get_process(),
            &ptr_data->mic_buf);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_create_from_mem, res);
        }
    }
    return true;
}
  795. bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
  796. {
  797. if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
  798. COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
  799. &ptr_data->mic_addr);
  800. if (res != COI_SUCCESS) {
  801. if (m_status != 0) {
  802. m_status->result = translate_coi_error(res);
  803. }
  804. else if (m_is_mandatory) {
  805. report_coi_error(c_buf_get_address, res);
  806. }
  807. return false;
  808. }
  809. }
  810. return true;
  811. }
  812. bool OffloadDescriptor::nullify_target_stack(
  813. COIBUFFER targ_buf,
  814. uint64_t size
  815. )
  816. {
  817. char * ptr = (char*)malloc(size);
  818. if (ptr == NULL)
  819. LIBOFFLOAD_ERROR(c_malloc);
  820. COIRESULT res;
  821. memset(ptr, 0, size);
  822. res = COI::BufferWrite(
  823. targ_buf,
  824. 0,
  825. ptr,
  826. size,
  827. COI_COPY_UNSPECIFIED,
  828. 0, 0, 0);
  829. free(ptr);
  830. if (res != COI_SUCCESS) {
  831. if (m_status != 0) {
  832. m_status->result = translate_coi_error(res);
  833. return false;
  834. }
  835. report_coi_error(c_buf_write, res);
  836. }
  837. return true;
  838. }
  839. static void print_persistList_item(
  840. const char *msg,
  841. PersistData *cur_el
  842. )
  843. {
  844. OFFLOAD_TRACE(4, "%s\n", msg);
  845. OFFLOAD_TRACE(4, " stack_cpu_addr = %p\n", cur_el->stack_cpu_addr);
  846. OFFLOAD_TRACE(4, " routine_id = %d\n", cur_el->routine_id);
  847. OFFLOAD_TRACE(4, " thread_id = %lld\n", cur_el->thread_id);
  848. OFFLOAD_TRACE(4, " stack_ptr_data = %p\n", cur_el->stack_ptr_data);
  849. OFFLOAD_TRACE(4, " MIC buffer = %p\n", cur_el->stack_ptr_data->mic_buf);
  850. OFFLOAD_TRACE(4, " MIC addr = %p\n", cur_el->stack_ptr_data->mic_addr);
  851. OFFLOAD_TRACE(4, " cpu_stack_addr = %p\n", cur_el->cpu_stack_addr);
  852. }
// Guards the per-device persistent stack buffer list (m_persist_list)
// across the stack-memory-manager routines below.
static mutex_t stack_memory_manager_lock;
// Manage the per-device list of persistent target stack buffers.
// Either reuses an existing buffer matching (stack_begin, routine_id) or
// creates, validates and nullifies a new one of buf_size bytes.  Entries
// that have become obsolete (host frames below the current top-of-stack for
// this thread, or dynamic siblings) are queued on m_destroy_stack and
// erased from the list.  *is_new is set to true only when a fresh buffer
// was created.  Returns false when a COI call fails (m_status recorded or
// fatal error reported as usual).
bool OffloadDescriptor::offload_stack_memory_manager(
    const void * stack_begin,
    int routine_id,
    int buf_size,
    int align,
    bool thread_specific_function_locals,
    bool *is_new)
{
    //mutex_locker_t locker(stack_alloc_lock);
    stack_memory_manager_lock.lock();

    PersistData * new_el;
    PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
    PersistDataList::iterator it_end;
    int erase = 0;
    uint64_t cur_thread_id = m_device.get_thread_id();

    OFFLOAD_TRACE(3, "offload_stack_memory_manager("
        "stack_begin=%p, routine_id=%d, buf_size=%d,"
        "align=%d, thread_specific_function_locals=%d, bool=%p)\n",
        stack_begin, routine_id, buf_size,
        align, thread_specific_function_locals, is_new);
    OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
    *is_new = false;

    // Compare each recorded host stack address against the current frame
    // to decide between reuse and destruction of the associated buffer.
    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
         it != m_device.m_persist_list.end(); it++) {
        PersistData cur_el = *it;

        print_persistList_item("Current element in persist list:", &cur_el);
        if (stack_begin > it->stack_cpu_addr) {
            // Entry lies below the current top-of-stack on the host.
            if (cur_thread_id == cur_el.thread_id) {
                // this stack data must be destroyed
                m_destroy_stack.push_front(cur_el.stack_ptr_data);
                it_end = it;
                erase++;
                OFFLOAD_TRACE(3, "Current element below TOS: so delete\n");
            }
        }
        else if (stack_begin == it->stack_cpu_addr) {
            if (routine_id != it->routine_id) {
                // this stack data must be destroyed
                // because the current function is a dynamic sibling
                m_destroy_stack.push_front(cur_el.stack_ptr_data);
                it_end = it;
                erase++;
                OFFLOAD_TRACE(3, "Current element is sibling: so delete\n");
                break;
            }
            else if (!thread_specific_function_locals ||
                     cur_thread_id == cur_el.thread_id) {
                // stack data is reused
                m_stack_ptr_data = it->stack_ptr_data;
                if (erase > 0) {
                    // all obsolete stack sections must be erased from the list
                    m_device.m_persist_list.erase(it_begin, ++it_end);
                    // account for the extra sink addresses that will be
                    // sent to the target for the destroyed buffers
                    m_in_datalen +=
                        erase * sizeof(new_el->stack_ptr_data->mic_addr);
                }
                OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
                              m_stack_ptr_data->mic_addr);
                stack_memory_manager_lock.unlock();
                return true;
            }
        }
        else if (stack_begin < it->stack_cpu_addr &&
                 cur_thread_id == cur_el.thread_id) {
            OFFLOAD_TRACE(3, "Current element is above TOS\n");
            break;
        }
    }

    if (erase > 0) {
        // all obsolete stack sections must be erased from the list
        m_device.m_persist_list.erase(it_begin, ++it_end);
        m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
    }
    // new stack table is created
    new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
    // create MIC buffer
    COIRESULT res;
    uint32_t buffer_flags = 0;

    // create buffer with large pages if data length exceeds
    // large page threshold
    if (buf_size >= __offload_use_2mb_buffers) {
        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
    }
    res = COI::BufferCreate(buf_size,
                            COI_BUFFER_NORMAL,
                            buffer_flags,
                            0,
                            1,
                            &m_device.get_process(),
                            &new_el->stack_ptr_data->mic_buf);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_create, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }
    // make buffer valid on the device.
    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
                              m_device.get_process(),
                              COI_BUFFER_VALID,
                              COI_BUFFER_NO_MOVE,
                              0, 0, 0);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_set_state, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }
    // and mark it invalid in the source (host) process.
    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
                              COI_PROCESS_SOURCE,
                              COI_BUFFER_INVALID,
                              COI_BUFFER_NO_MOVE,
                              0, 0, 0);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_set_state, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }
    // persistence algorithm requires target stack initialy to be nullified
    if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
        stack_memory_manager_lock.unlock();
        return false;
    }

    m_stack_ptr_data = new_el->stack_ptr_data;
    init_mic_address(m_stack_ptr_data);
    OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
                  m_stack_ptr_data->mic_addr);
    m_device.m_persist_list.push_front(*new_el);
    init_mic_address(new_el->stack_ptr_data);
    *is_new = true;

    stack_memory_manager_lock.unlock();
    return true;
}
  999. // Search through persistent stack buffers
  1000. // for the top-of-stack buffer for this thread
  1001. char* OffloadDescriptor::get_this_threads_cpu_stack_addr(
  1002. const void * stack_begin,
  1003. int routine_id,
  1004. bool thread_specific_function_locals
  1005. )
  1006. {
  1007. uint64_t cur_thread_id = m_device.get_thread_id();
  1008. char* matched = 0;
  1009. OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr("
  1010. "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
  1011. stack_begin, routine_id, thread_specific_function_locals);
  1012. OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
  1013. stack_memory_manager_lock.lock();
  1014. for (PersistDataList::iterator it = m_device.m_persist_list.begin();
  1015. it != m_device.m_persist_list.end(); it++)
  1016. {
  1017. PersistData cur_el = *it;
  1018. print_persistList_item("Current element in persist list:", &cur_el);
  1019. if (stack_begin == cur_el.stack_cpu_addr)
  1020. {
  1021. // For OpenMP shared function locals matching is done without
  1022. // regard to thread id. But, we return the last match, which
  1023. // corresponds to the outer stack.
  1024. if (!thread_specific_function_locals)
  1025. {
  1026. matched = cur_el.cpu_stack_addr;
  1027. continue;
  1028. }
  1029. // For non-OpenMP shared function-local variables
  1030. // the thread-id must match
  1031. if (cur_thread_id == cur_el.thread_id)
  1032. {
  1033. matched = cur_el.cpu_stack_addr;
  1034. break;
  1035. }
  1036. }
  1037. }
  1038. stack_memory_manager_lock.unlock();
  1039. if (matched != 0)
  1040. {
  1041. OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr() => %p\n", matched);
  1042. return matched;
  1043. }
  1044. OFFLOAD_TRACE(1,
  1045. "Could not find persistent data; expect Read/Write failure\n");
  1046. return 0;
  1047. }
  1048. // Search through persistent stack buffers
  1049. // for the top-of-stack MIC buffer for this thread
  1050. PtrData* OffloadDescriptor::get_this_threads_mic_stack_addr(
  1051. const void * stack_begin,
  1052. int routine_id,
  1053. bool thread_specific_function_locals
  1054. )
  1055. {
  1056. uint64_t cur_thread_id = m_device.get_thread_id();
  1057. PtrData* matched = 0;
  1058. OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr("
  1059. "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
  1060. stack_begin, routine_id, thread_specific_function_locals);
  1061. OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
  1062. stack_memory_manager_lock.lock();
  1063. for (PersistDataList::iterator it = m_device.m_persist_list.begin();
  1064. it != m_device.m_persist_list.end(); it++)
  1065. {
  1066. PersistData cur_el = *it;
  1067. print_persistList_item("Current element in persist list:", &cur_el);
  1068. if (stack_begin == cur_el.stack_cpu_addr)
  1069. {
  1070. // For OpenMP shared function locals matching is done without
  1071. // regard to thread id. But, we return the last match, which
  1072. // corresponds to the outer stack.
  1073. if (!thread_specific_function_locals)
  1074. {
  1075. matched = cur_el.stack_ptr_data;
  1076. continue;
  1077. }
  1078. // For non-OpenMP shared function-local variables
  1079. // the thread-id must match
  1080. if (cur_thread_id == cur_el.thread_id)
  1081. {
  1082. matched = cur_el.stack_ptr_data;
  1083. break;
  1084. }
  1085. }
  1086. }
  1087. stack_memory_manager_lock.unlock();
  1088. if (matched != 0)
  1089. {
  1090. OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr() => %p\n", matched);
  1091. return matched;
  1092. }
  1093. OFFLOAD_TRACE(1,
  1094. "Could not find persistent data; expect Read/Write failure\n");
  1095. return 0;
  1096. }
  1097. void OffloadDescriptor::setup_use_device_ptr(int i)
  1098. {
  1099. PtrData *ptr_data;
  1100. ArrDesc *dvp;
  1101. void *base;
  1102. if (m_vars_extra[i].type_src == c_dv_ptr) {
  1103. dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
  1104. base = reinterpret_cast<void*>(dvp->Base);
  1105. }
  1106. else {
  1107. base = *static_cast<void**>(m_vars[i].ptr);
  1108. }
  1109. if (m_vars[i].direction.in) {
  1110. int64_t *device_ptr;
  1111. bool is_new = true;
  1112. find_device_ptr(device_ptr, base);
  1113. // Create a entry in targetptr table using device_ptr
  1114. // as lookup for later recover the host pointer
  1115. ptr_data = m_device.insert_targetptr_data(device_ptr,
  1116. 0, is_new);
  1117. // Actually the base is a host pointer and cpu_addr is
  1118. // device pointer. This is special case where the 2
  1119. // address usage is reversed to enable using existing
  1120. // PtrData structure instead of adding new fields.
  1121. ptr_data->mic_addr = (uint64_t) base;
  1122. ptr_data->alloc_ptr_data_lock.unlock();
  1123. // Replace host pointer with device pointer
  1124. if (m_vars_extra[i].type_src == c_dv_ptr) {
  1125. dvp->Base = reinterpret_cast<dv_size>(device_ptr);
  1126. }
  1127. else {
  1128. *static_cast<void**>(m_vars[i].ptr) = device_ptr;
  1129. }
  1130. }
  1131. else if (m_vars[i].direction.out) {
  1132. // For use_device_ptr and out find associated host ptr
  1133. // and assign to host ptr
  1134. ptr_data = m_device.find_targetptr_data(base);
  1135. if (!ptr_data) {
  1136. LIBOFFLOAD_ERROR(c_no_ptr_data, base);
  1137. exit(1);
  1138. }
  1139. if (m_vars_extra[i].type_src == c_dv_ptr) {
  1140. dvp->Base = ptr_data->mic_addr;
  1141. }
  1142. else {
  1143. *static_cast<void**>(m_vars[i].ptr) =
  1144. reinterpret_cast<void*>(ptr_data->mic_addr);
  1145. }
  1146. m_device.remove_targetptr_data(
  1147. ptr_data->cpu_addr.start());
  1148. }
  1149. }
  1150. bool OffloadDescriptor::setup_descriptors(
  1151. VarDesc *vars,
  1152. VarDesc2 *vars2,
  1153. int vars_total,
  1154. int entry_id,
  1155. const void *stack_addr
  1156. )
  1157. {
  1158. COIRESULT res;
  1159. // To enable caching the CPU stack base address for stack variables
  1160. char* this_threads_cpu_stack_addr = 0;
  1161. // To properly deal with non-OpenMP threading and function-local variables
  1162. // For OpenMP threading we support all function-locals in shared mode only
  1163. bool thread_specific_function_locals = !omp_in_parallel();
  1164. OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
  1165. // make a copy of variable descriptors
  1166. m_vars_total = vars_total;
  1167. if (vars_total > 0) {
  1168. m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
  1169. if (m_vars == NULL)
  1170. LIBOFFLOAD_ERROR(c_malloc);
  1171. memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
  1172. m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
  1173. if (m_vars_extra == NULL)
  1174. LIBOFFLOAD_ERROR(c_malloc);
  1175. }
  1176. // dependencies
  1177. m_in_deps_allocated = m_vars_total + 1;
  1178. m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
  1179. if (m_in_deps == NULL)
  1180. LIBOFFLOAD_ERROR(c_malloc);
  1181. if (m_vars_total > 0) {
  1182. m_out_deps_allocated = m_vars_total;
  1183. m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
  1184. if (m_out_deps == NULL)
  1185. LIBOFFLOAD_ERROR(c_malloc);
  1186. }
  1187. // copyin/copyout data length
  1188. m_in_datalen = 0;
  1189. m_out_datalen = 0;
  1190. // First pass over variable descriptors
  1191. // - Calculate size of the input and output non-pointer data
  1192. // - Allocate buffers for input and output pointers
  1193. for (int i = 0; i < m_vars_total; i++) {
  1194. void* alloc_base = NULL;
  1195. int64_t alloc_disp = 0;
  1196. int64_t alloc_size = 0;
  1197. bool src_is_for_mic = (m_vars[i].direction.out ||
  1198. m_vars[i].into == NULL);
  1199. bool src_is_for_host = (m_vars[i].direction.in ||
  1200. m_vars[i].into == NULL);
  1201. const char *var_sname = "";
  1202. if (vars2 != NULL && i < vars_total) {
  1203. if (vars2[i].sname != NULL) {
  1204. var_sname = vars2[i].sname;
  1205. }
  1206. }
  1207. // instead of m_vars[i].type.src we will use m_vars_extra[i].type_src
  1208. if (m_vars[i].type.src == c_extended_type) {
  1209. VarDescExtendedType *etype =
  1210. reinterpret_cast<VarDescExtendedType*>(m_vars[i].ptr);
  1211. m_vars_extra[i].type_src = etype->extended_type;
  1212. m_vars[i].ptr = etype->ptr;
  1213. }
  1214. else {
  1215. m_vars_extra[i].type_src = m_vars[i].type.src;
  1216. }
  1217. // instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst
  1218. if (m_vars[i].type.dst == c_extended_type) {
  1219. VarDescExtendedType *etype =
  1220. reinterpret_cast<VarDescExtendedType*>(m_vars[i].into);
  1221. if (etype) {
  1222. m_vars_extra[i].type_dst = etype->extended_type;
  1223. m_vars[i].into = etype->ptr;
  1224. }
  1225. else {
  1226. m_vars_extra[i].type_dst = m_vars_extra[i].type_src;
  1227. }
  1228. }
  1229. else {
  1230. m_vars_extra[i].type_dst = m_vars[i].type.dst;
  1231. }
  1232. OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
  1233. i, var_sname,
  1234. vardesc_direction_as_string[m_vars[i].direction.bits],
  1235. vardesc_type_as_string[m_vars_extra[i].type_src]);
  1236. if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
  1237. OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname,
  1238. vardesc_type_as_string[m_vars_extra[i].type_dst]);
  1239. }
  1240. OFFLOAD_TRACE(2,
  1241. " type_src=%d, type_dstn=%d, direction=%d, "
  1242. "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
  1243. "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
  1244. m_vars_extra[i].type_src,
  1245. m_vars_extra[i].type_dst,
  1246. m_vars[i].direction.bits,
  1247. m_vars[i].alloc_if,
  1248. m_vars[i].free_if,
  1249. m_vars[i].align,
  1250. m_vars[i].mic_offset,
  1251. m_vars[i].flags.bits,
  1252. m_vars[i].offset,
  1253. m_vars[i].size,
  1254. m_vars[i].count,
  1255. m_vars[i].ptr,
  1256. m_vars[i].into);
  1257. // If any varDesc flags bits set, show them
  1258. if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
  1259. trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
  1260. }
  1261. // preallocated implies targetptr
  1262. if (m_vars[i].flags.preallocated) {
  1263. // targetptr preallocated alloc_if(1) may not be used with
  1264. // an in clause
  1265. if (m_vars[i].direction.in && m_vars[i].alloc_if) {
  1266. LIBOFFLOAD_ERROR(c_in_with_preallocated);
  1267. exit(1);
  1268. }
  1269. m_vars[i].flags.targetptr = 1;
  1270. }
  1271. if (m_vars[i].alloc != NULL) {
  1272. // array descriptor
  1273. const Arr_Desc *ap =
  1274. static_cast<const Arr_Desc*>(m_vars[i].alloc);
  1275. // debug dump
  1276. ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1);
  1277. __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
  1278. alloc_base = reinterpret_cast<void*>(ap->base);
  1279. }
  1280. m_vars_extra[i].alloc = m_vars[i].alloc;
  1281. m_vars_extra[i].auto_data = 0;
  1282. m_vars_extra[i].cpu_disp = 0;
  1283. m_vars_extra[i].cpu_offset = 0;
  1284. m_vars_extra[i].src_data = 0;
  1285. m_vars_extra[i].read_rng_src = 0;
  1286. m_vars_extra[i].read_rng_dst = 0;
  1287. m_vars_extra[i].omp_last_event_type = c_last_not;
  1288. // flag is_arr_ptr_el is 1 only for var_descs generated
  1289. // for c_data_ptr_array type
  1290. if (i < vars_total) {
  1291. m_vars_extra[i].is_arr_ptr_el = 0;
  1292. }
  1293. if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
  1294. TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
  1295. m_vars[i].flags.is_pointer) {
  1296. m_vars_extra[i].pointer_offset = m_vars[i].offset;
  1297. m_vars[i].offset = 0;
  1298. m_in_datalen += sizeof(m_vars[i].offset);
  1299. }
  1300. switch (m_vars_extra[i].type_src) {
  1301. case c_data_ptr_array:
  1302. {
  1303. const Arr_Desc *ap;
  1304. const VarDesc3 *vd3 =
  1305. static_cast<const VarDesc3*>(m_vars[i].ptr);
  1306. int flags = vd3->array_fields;
  1307. OFFLOAD_TRACE(2,
  1308. " pointer array flags = %04x\n", flags);
  1309. OFFLOAD_TRACE(2,
  1310. " pointer array type is %s\n",
  1311. vardesc_type_as_string[flags & 0x3f]);
  1312. ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
  1313. ARRAY_DESC_DUMP(" ", "ptr array", ap,
  1314. m_vars[i].flags.is_pointer, 1);
  1315. if (m_vars[i].into) {
  1316. ap = static_cast<const Arr_Desc*>(m_vars[i].into);
  1317. ARRAY_DESC_DUMP(
  1318. " ", "into array", ap, 0, 1);
  1319. }
  1320. if ((flags & (1<<flag_align_is_array)) != 0) {
  1321. ap = static_cast<const Arr_Desc*>(vd3->align_array);
  1322. ARRAY_DESC_DUMP(
  1323. " ", "align array", ap, 0, 1);
  1324. }
  1325. if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
  1326. ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
  1327. ARRAY_DESC_DUMP(
  1328. " ", "alloc_if array", ap, 0, 1);
  1329. }
  1330. if ((flags & (1<<flag_free_if_is_array)) != 0) {
  1331. ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
  1332. ARRAY_DESC_DUMP(
  1333. " ", "free_if array", ap, 0, 1);
  1334. }
  1335. if ((flags & (1<<flag_extent_start_is_array)) != 0) {
  1336. ap = static_cast<const Arr_Desc*>(vd3->extent_start);
  1337. ARRAY_DESC_DUMP(
  1338. " ", "extent_start array", ap, 0, 1);
  1339. } else if ((flags &
  1340. (1<<flag_extent_start_is_scalar)) != 0) {
  1341. OFFLOAD_TRACE(2,
  1342. " extent_start scalar = %d\n",
  1343. (int64_t)vd3->extent_start);
  1344. }
  1345. if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
  1346. ap = static_cast<const Arr_Desc*>
  1347. (vd3->extent_elements);
  1348. ARRAY_DESC_DUMP(" ",
  1349. "extent_elements array", ap, 0, 1);
  1350. } else if ((flags &
  1351. (1<<flag_extent_elements_is_scalar)) != 0) {
  1352. OFFLOAD_TRACE(2,
  1353. " extent_elements scalar = %d\n",
  1354. (int64_t)vd3->extent_elements);
  1355. }
  1356. if ((flags & (1<<flag_into_start_is_array)) != 0) {
  1357. ap = static_cast<const Arr_Desc*>(vd3->into_start);
  1358. ARRAY_DESC_DUMP(
  1359. " ", "into_start array", ap, 0, 1);
  1360. } else if ((flags &
  1361. (1<<flag_into_start_is_scalar)) != 0) {
  1362. OFFLOAD_TRACE(2,
  1363. " into_start scalar = %d\n",
  1364. (int64_t)vd3->into_start);
  1365. }
  1366. if ((flags & (1<<flag_into_elements_is_array)) != 0) {
  1367. ap = static_cast<const Arr_Desc*>(vd3->into_elements);
  1368. ARRAY_DESC_DUMP(
  1369. " ", "into_elements array", ap, 0, 1);
  1370. } else if ((flags &
  1371. (1<<flag_into_elements_is_scalar)) != 0) {
  1372. OFFLOAD_TRACE(2,
  1373. " into_elements scalar = %d\n",
  1374. (int64_t)vd3->into_elements);
  1375. }
  1376. if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
  1377. ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
  1378. ARRAY_DESC_DUMP(
  1379. " ", "alloc_start array", ap, 0, 1);
  1380. } else if ((flags &
  1381. (1<<flag_alloc_start_is_scalar)) != 0) {
  1382. OFFLOAD_TRACE(2,
  1383. " alloc_start scalar = %d\n",
  1384. (int64_t)vd3->alloc_start);
  1385. }
  1386. if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
  1387. ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
  1388. ARRAY_DESC_DUMP(" ",
  1389. "alloc_elements array", ap, 0, 1);
  1390. } else if ((flags &
  1391. (1<<flag_alloc_elements_is_scalar)) != 0) {
  1392. OFFLOAD_TRACE(2,
  1393. " alloc_elements scalar = %d\n",
  1394. (int64_t)vd3->alloc_elements);
  1395. }
  1396. }
  1397. if (!gen_var_descs_for_pointer_array(i)) {
  1398. return false;
  1399. }
  1400. break;
  1401. case c_data:
  1402. case c_void_ptr:
  1403. case c_void_ptr_ptr:
  1404. case c_cean_var:
  1405. // In all uses later
  1406. // VarDesc.size will have the length of the data to be
  1407. // transferred
  1408. // VarDesc.disp will have an offset from base
  1409. if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
  1410. NonContigDesc *desc =
  1411. static_cast<NonContigDesc*>(m_vars[i].ptr);
  1412. noncont_struct_dump(" ", "DATA", desc);
  1413. m_vars_extra[i].noncont_desc = desc;
  1414. m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
  1415. m_vars[i].size = get_noncont_struct_size(desc);
  1416. m_vars[i].disp = 0;
  1417. }
  1418. else if (m_vars_extra[i].type_src == c_cean_var) {
  1419. // array descriptor
  1420. const Arr_Desc *ap =
  1421. static_cast<const Arr_Desc*>(m_vars[i].ptr);
  1422. // debug dump
  1423. ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
  1424. // offset and length are derived from the array descriptor
  1425. __arr_data_offset_and_length(ap, m_vars[i].disp,
  1426. m_vars[i].size);
  1427. if (!is_arr_desc_contiguous(ap)) {
  1428. m_vars[i].flags.is_noncont_src = 1;
  1429. m_vars_extra[i].read_rng_src =
  1430. init_read_ranges_arr_desc(ap);
  1431. }
  1432. // all necessary information about length and offset is
  1433. // transferred in var descriptor. There is no need to send
  1434. // array descriptor to the target side.
  1435. m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
  1436. }
  1437. else {
  1438. m_vars[i].size *= m_vars[i].count;
  1439. m_vars[i].disp = 0;
  1440. }
  1441. if (m_vars[i].direction.bits) {
  1442. // make sure that transfer size > 0
  1443. if (m_vars[i].size <= 0) {
  1444. LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
  1445. exit(1);
  1446. }
  1447. if (m_vars[i].flags.is_static) {
  1448. PtrData *ptr_data;
  1449. // find data associated with variable
  1450. if (!find_ptr_data(ptr_data,
  1451. m_vars[i].ptr,
  1452. m_vars[i].disp,
  1453. m_vars[i].size,
  1454. false, false)) {
  1455. return false;
  1456. }
  1457. if (ptr_data != 0) {
  1458. // offset to base from the beginning of the buffer
  1459. // memory
  1460. m_vars[i].offset =
  1461. (char*) m_vars[i].ptr -
  1462. (char*) ptr_data->cpu_addr.start();
  1463. }
  1464. else {
  1465. m_vars[i].flags.is_static = false;
  1466. if (m_vars[i].into == NULL) {
  1467. m_vars[i].flags.is_static_dstn = false;
  1468. }
  1469. }
  1470. m_vars_extra[i].src_data = ptr_data;
  1471. }
  1472. if (m_vars[i].direction.in &&
  1473. !m_vars[i].flags.is_static &&
  1474. !m_vars[i].flags.is_stack_buf) {
  1475. m_in_datalen += m_vars[i].size;
  1476. // for non-static target destination defined as CEAN
  1477. // expression we pass to target its size and dist
  1478. if (m_vars[i].into == NULL &&
  1479. m_vars_extra[i].type_src == c_cean_var) {
  1480. m_in_datalen += 2 * sizeof(uint64_t);
  1481. }
  1482. m_need_runfunction = true;
  1483. }
  1484. if (m_vars[i].direction.out &&
  1485. !m_vars[i].flags.is_static &&
  1486. !m_vars[i].flags.is_stack_buf) {
  1487. m_out_datalen += m_vars[i].size;
  1488. m_need_runfunction = true;
  1489. }
  1490. }
  1491. if (m_is_openmp && src_is_for_host &&
  1492. !m_vars[i].flags.is_device_ptr) {
  1493. if (m_vars[i].flags.is_static) {
  1494. PtrData *ptr_data = m_vars_extra[i].src_data;
  1495. // Static data is transferred either by omp target
  1496. // update construct which passes zeros for
  1497. // alloc_if and free_if or by always modifier.
  1498. // Implicit openmp reference is transfered also
  1499. // if its reference count is equal to 1
  1500. if (ptr_data &&
  1501. IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
  1502. if (m_vars[i].alloc_if) {
  1503. ptr_data->add_reference();
  1504. }
  1505. if (!m_vars[i].flags.always_copy &&
  1506. (m_vars[i].alloc_if || m_vars[i].free_if) &&
  1507. ptr_data->get_reference() != 1) {
  1508. m_vars[i].direction.bits = c_parameter_nocopy;
  1509. }
  1510. }
  1511. else if (
  1512. !m_vars[i].flags.always_copy &&
  1513. (m_vars[i].alloc_if || m_vars[i].free_if)) {
  1514. m_vars[i].direction.bits = c_parameter_nocopy;
  1515. }
  1516. }
  1517. else {
  1518. AutoData *auto_data;
  1519. if (m_vars[i].alloc_if) {
  1520. auto_data = m_device.insert_auto_data(
  1521. m_vars[i].ptr, m_vars[i].size);
  1522. auto_data->add_reference();
  1523. }
  1524. else {
  1525. // TODO: what should be done if var is not in
  1526. // the table?
  1527. auto_data = m_device.find_auto_data(
  1528. m_vars[i].ptr);
  1529. }
  1530. // For automatic variables data is transferred:
  1531. // - if always modifier is used OR
  1532. // - if alloc_if == 0 && free_if == 0 OR
  1533. // - if reference count is 1
  1534. if (!m_vars[i].flags.always_copy &&
  1535. (m_vars[i].alloc_if || m_vars[i].free_if) &&
  1536. auto_data != 0 &&
  1537. auto_data->get_reference() != 1) {
  1538. m_vars[i].direction.bits = c_parameter_nocopy;
  1539. }
  1540. // save data for later use
  1541. m_vars_extra[i].auto_data = auto_data;
  1542. }
  1543. }
  1544. break;
  1545. case c_dv:
  1546. if (m_vars[i].flags.use_device_ptr) {
  1547. setup_use_device_ptr(i);
  1548. break;
  1549. }
  1550. else if (m_vars[i].direction.bits ||
  1551. m_vars[i].alloc_if ||
  1552. m_vars[i].free_if) {
  1553. ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
  1554. // debug dump
  1555. __dv_desc_dump("IN/OUT", dvp);
  1556. // send dope vector contents excluding base
  1557. m_in_datalen += m_vars[i].size - sizeof(uint64_t);
  1558. m_need_runfunction = true;
  1559. }
  1560. break;
  1561. case c_string_ptr:
  1562. case c_string_ptr_ptr:
  1563. if ((m_vars[i].direction.bits ||
  1564. m_vars[i].alloc_if ||
  1565. m_vars[i].free_if) &&
  1566. m_vars[i].size == 0) {
  1567. m_vars[i].size = 1;
  1568. m_vars[i].count =
  1569. strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
  1570. }
  1571. /* fallthru */
  1572. case c_data_ptr:
  1573. case c_data_ptr_ptr:
  1574. if (m_vars[i].flags.is_stack_buf &&
  1575. !m_vars[i].direction.bits &&
  1576. m_vars[i].alloc_if) {
  1577. // this var_desc is for stack buffer
  1578. bool is_new;
  1579. if (!offload_stack_memory_manager(
  1580. stack_addr, entry_id,
  1581. m_vars[i].count, m_vars[i].align,
  1582. thread_specific_function_locals, &is_new)) {
  1583. return false;
  1584. }
  1585. if (is_new) {
  1586. m_compute_buffers.push_back(
  1587. m_stack_ptr_data->mic_buf);
  1588. m_device.m_persist_list.front().cpu_stack_addr =
  1589. static_cast<char*>(m_vars[i].ptr);
  1590. PersistData *new_el = &m_device.m_persist_list.front();
  1591. print_persistList_item(
  1592. "New element in persist list:",
  1593. new_el);
  1594. }
  1595. else {
  1596. m_vars[i].flags.sink_addr = 1;
  1597. m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
  1598. if (thread_specific_function_locals) {
  1599. m_stack_ptr_data = get_this_threads_mic_stack_addr(
  1600. stack_addr, entry_id,
  1601. thread_specific_function_locals);
  1602. }
  1603. }
  1604. m_vars[i].size = m_destroy_stack.size();
  1605. m_vars_extra[i].src_data = m_stack_ptr_data;
  1606. // need to add or remove references for stack buffer at target
  1607. if (is_new || m_destroy_stack.size()) {
  1608. m_need_runfunction = true;
  1609. }
  1610. break;
  1611. }
  1612. /* fallthru */
  1613. case c_cean_var_ptr:
  1614. case c_cean_var_ptr_ptr:
  1615. case c_dv_ptr:
  1616. if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
  1617. NonContigDesc *desc =
  1618. static_cast<NonContigDesc*>(m_vars[i].ptr);
  1619. noncont_struct_dump(" ", "PTR", desc);
  1620. m_vars_extra[i].noncont_desc = desc;
  1621. m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
  1622. m_vars[i].disp = 0;
  1623. }
  1624. else if (m_vars_extra[i].type_src == c_cean_var_ptr ||
  1625. m_vars_extra[i].type_src == c_cean_var_ptr_ptr) {
  1626. // array descriptor
  1627. const Arr_Desc *ap =
  1628. static_cast<const Arr_Desc*>(m_vars[i].ptr);
  1629. // debug dump
  1630. ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
  1631. // offset and length are derived from the array descriptor
  1632. __arr_data_offset_and_length(ap, m_vars[i].disp,
  1633. m_vars[i].size);
  1634. if (!is_arr_desc_contiguous(ap)) {
  1635. m_vars[i].flags.is_noncont_src = 1;
  1636. m_vars_extra[i].read_rng_src =
  1637. init_read_ranges_arr_desc(ap);
  1638. }
  1639. // all necessary information about length and offset is
  1640. // transferred in var descriptor. There is no need to send
  1641. // array descriptor to the target side.
  1642. m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
  1643. }
  1644. else if (m_vars_extra[i].type_src == c_dv_ptr) {
  1645. // need to send DV to the device unless it is 'nocopy'
  1646. if (m_vars[i].direction.bits ||
  1647. m_vars[i].alloc_if ||
  1648. m_vars[i].free_if) {
  1649. ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
  1650. // debug dump
  1651. __dv_desc_dump("IN/OUT", dvp);
  1652. // for use_device_ptr don't need to change
  1653. // OUT direction to IN direction
  1654. if (!m_vars[i].flags.use_device_ptr) {
  1655. m_vars[i].direction.bits = c_parameter_in;
  1656. }
  1657. }
  1658. // no displacement
  1659. m_vars[i].disp = 0;
  1660. }
  1661. else {
  1662. // For "use_device_ptr" if direction is "in" then need to
  1663. // find the associated device pointer and replace the host
  1664. // pointer with device pointer. Also save the host pointer
  1665. // to restore when "out" is encountered.
  1666. // For "out" find the host pointer associated with the
  1667. // device pointer and restore the host pointer
  1668. if (m_vars[i].flags.use_device_ptr && src_is_for_host) {
  1669. setup_use_device_ptr(i);
  1670. break;
  1671. }
  1672. // c_data_ptr or c_string_ptr
  1673. m_vars[i].size *= m_vars[i].count;
  1674. m_vars[i].disp = 0;
  1675. }
  1676. if (m_vars[i].direction.bits ||
  1677. m_vars[i].alloc_if ||
  1678. m_vars[i].free_if) {
  1679. PtrData *ptr_data;
  1680. // check that buffer length > 0
  1681. if (m_vars[i].alloc_if &&
  1682. m_vars[i].disp + m_vars[i].size <
  1683. (m_is_openmp ? 0 : 1)) {
  1684. LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
  1685. exit(1);
  1686. }
  1687. // base address
  1688. void *base = *static_cast<void**>(m_vars[i].ptr);
  1689. // allocate buffer if we have no INTO and don't need
  1690. // allocation for the ptr at target
  1691. if (src_is_for_mic) {
  1692. if (m_vars[i].flags.is_stack_buf) {
  1693. // for stack persistent objects ptr data is created
  1694. // by var_desc with number 0.
  1695. // Its ptr_data is stored at m_stack_ptr_data
  1696. ptr_data = m_stack_ptr_data;
  1697. }
  1698. else if (m_vars[i].alloc_if) {
  1699. if (m_vars[i].flags.preallocated) {
  1700. m_out_datalen += sizeof(void*);
  1701. m_need_runfunction = true;
  1702. break;
  1703. }
  1704. // add new entry
  1705. if (!alloc_ptr_data(
  1706. ptr_data,
  1707. reinterpret_cast<char *>(base) + alloc_disp,
  1708. (alloc_base != NULL) ?
  1709. alloc_disp : m_vars[i].disp,
  1710. (alloc_base != NULL) ?
  1711. alloc_size : m_vars[i].size,
  1712. alloc_disp,
  1713. (alloc_base != NULL) ?
  1714. 0 : m_vars[i].align,
  1715. m_vars[i].flags.targetptr,
  1716. 0,
  1717. m_vars[i].flags.pin)) {
  1718. return false;
  1719. }
  1720. if (m_vars[i].flags.targetptr) {
  1721. if (!init_mic_address(ptr_data)) {
  1722. return false;
  1723. }
  1724. *static_cast<void**>(m_vars[i].ptr) = base =
  1725. reinterpret_cast<void*>(ptr_data->mic_addr);
  1726. }
  1727. if (ptr_data->add_reference() == 0 &&
  1728. ptr_data->mic_buf != 0) {
  1729. // add buffer to the list of buffers that
  1730. // are passed to dispatch call
  1731. m_compute_buffers.push_back(
  1732. ptr_data->mic_buf);
  1733. }
  1734. else if (!m_vars[i].flags.pin &&
  1735. !m_vars[i].flags.preallocated) {
  1736. // will send buffer address to device
  1737. m_vars[i].flags.sink_addr = 1;
  1738. m_in_datalen += sizeof(ptr_data->mic_addr);
  1739. }
  1740. if (!m_vars[i].flags.pin &&
  1741. !ptr_data->is_static) {
  1742. // need to add reference for buffer
  1743. m_need_runfunction = true;
  1744. }
  1745. }
  1746. else {
  1747. bool error_if_not_found = true;
  1748. if (m_is_openmp) {
  1749. // For omp target update variable is ignored
  1750. // if it does not exist.
  1751. if (m_vars[i].flags.always_copy ||
  1752. (!m_vars[i].alloc_if &&
  1753. !m_vars[i].free_if)) {
  1754. error_if_not_found = false;
  1755. }
  1756. }
  1757. // use existing association from pointer table
  1758. if (!find_ptr_data(ptr_data,
  1759. base,
  1760. m_vars[i].disp,
  1761. m_vars[i].size,
  1762. m_vars[i].flags.targetptr,
  1763. error_if_not_found)) {
  1764. return false;
  1765. }
  1766. if (m_is_openmp) {
  1767. // make var nocopy if it does not exist
  1768. if (ptr_data == 0) {
  1769. m_vars[i].direction.bits =
  1770. c_parameter_nocopy;
  1771. }
  1772. }
  1773. if (ptr_data != 0) {
  1774. m_vars[i].flags.sink_addr = 1;
  1775. m_in_datalen += sizeof(ptr_data->mic_addr);
  1776. }
  1777. }
  1778. if (ptr_data != 0) {
  1779. if (ptr_data->alloc_disp != 0) {
  1780. m_vars[i].flags.alloc_disp = 1;
  1781. m_in_datalen += sizeof(alloc_disp);
  1782. }
  1783. if (m_vars[i].flags.sink_addr) {
  1784. // get buffers's address on the sink
  1785. if (!init_mic_address(ptr_data)) {
  1786. return false;
  1787. }
  1788. m_in_datalen += sizeof(ptr_data->mic_addr);
  1789. }
  1790. if (!m_vars[i].flags.pin &&
  1791. !ptr_data->is_static && m_vars[i].free_if) {
  1792. // need to decrement buffer reference on target
  1793. m_need_runfunction = true;
  1794. }
  1795. // offset to base from the beginning of the buffer
  1796. // memory
  1797. m_vars[i].offset = (char*) base -
  1798. (char*) ptr_data->cpu_addr.start();
  1799. // copy other pointer properties to var descriptor
  1800. m_vars[i].mic_offset = ptr_data->mic_offset;
  1801. m_vars[i].flags.is_static = ptr_data->is_static;
  1802. }
  1803. }
  1804. else {
  1805. if (!find_ptr_data(ptr_data,
  1806. base,
  1807. m_vars[i].disp,
  1808. m_vars[i].size,
  1809. false, false)) {
  1810. return false;
  1811. }
  1812. if (ptr_data) {
  1813. m_vars[i].offset =
  1814. (char*) base -
  1815. (char*) ptr_data->cpu_addr.start();
  1816. }
  1817. }
  1818. if (m_is_openmp) {
  1819. if (m_vars[i].flags.use_device_ptr) {
  1820. setup_use_device_ptr(i);
  1821. }
  1822. // for TO transfer of stack buffer's variable
  1823. if (src_is_for_host && m_vars[i].flags.is_stack_buf) {
  1824. AutoData *auto_data;
  1825. char *base = *static_cast<char**>(m_vars[i].ptr);
  1826. if (m_vars[i].alloc_if) {
  1827. auto_data =m_device.insert_auto_data(
  1828. base + m_vars[i].disp,
  1829. m_vars[i].size);
  1830. auto_data->add_reference();
  1831. }
  1832. else {
  1833. auto_data = m_device.find_auto_data(
  1834. base + m_vars[i].disp);
  1835. }
  1836. // save data for later use
  1837. m_vars_extra[i].auto_data = auto_data;
  1838. // For automatic variables
  1839. // data is transferred:
  1840. // - if always modifier is used OR
  1841. // - if alloc_if == 0 && free_if == 0 OR
  1842. // - if reference count is 1
  1843. if (!m_vars[i].flags.always_copy &&
  1844. (m_vars[i].alloc_if ||
  1845. m_vars[i].free_if) &&
  1846. auto_data != 0 &&
  1847. auto_data->get_reference() != 1) {
  1848. m_vars[i].direction.bits =
  1849. c_parameter_nocopy;
  1850. }
  1851. }
  1852. // for FROM transfer of global pointer variable
  1853. // FROM transfer of stack buffer's variable
  1854. // is treated at INTO branch
  1855. else if (src_is_for_mic &&
  1856. !m_vars[i].flags.is_stack_buf) {
  1857. // data is transferred only if
  1858. // alloc_if == 0 && free_if == 0
  1859. // or reference count is 1
  1860. if (!m_vars[i].flags.always_copy &&
  1861. (m_vars[i].alloc_if ||
  1862. m_vars[i].free_if) &&
  1863. ptr_data &&
  1864. ptr_data->get_reference() != 1)
  1865. {
  1866. m_vars[i].direction.bits =
  1867. c_parameter_nocopy;
  1868. }
  1869. }
  1870. }
  1871. // save pointer data
  1872. m_vars_extra[i].src_data = ptr_data;
  1873. }
  1874. break;
  1875. case c_func_ptr:
  1876. case c_func_ptr_ptr:
  1877. if (m_vars[i].direction.in) {
  1878. m_in_datalen += __offload_funcs.max_name_length();
  1879. }
  1880. if (m_vars[i].direction.out) {
  1881. m_out_datalen += __offload_funcs.max_name_length();
  1882. }
  1883. m_need_runfunction = true;
  1884. break;
  1885. case c_dv_data:
  1886. case c_dv_ptr_data:
  1887. case c_dv_data_slice:
  1888. case c_dv_ptr_data_slice:
  1889. ArrDesc *dvp;
  1890. if (m_vars[i].flags.is_non_cont_struct) {
  1891. NonContigDesc *desc =
  1892. static_cast<NonContigDesc*>(m_vars[i].ptr);
  1893. noncont_struct_dump(" ", "DV-DATA", desc);
  1894. dvp = reinterpret_cast<ArrDesc*>(desc->base);
  1895. }
  1896. else if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
  1897. const Arr_Desc *ap;
  1898. ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
  1899. dvp = (m_vars_extra[i].type_src == c_dv_data_slice) ?
  1900. reinterpret_cast<ArrDesc*>(ap->base) :
  1901. *reinterpret_cast<ArrDesc**>(ap->base);
  1902. }
  1903. else {
  1904. dvp = (m_vars_extra[i].type_src == c_dv_data) ?
  1905. static_cast<ArrDesc*>(m_vars[i].ptr) :
  1906. *static_cast<ArrDesc**>(m_vars[i].ptr);
  1907. }
  1908. // if allocatable dope vector isn't allocated don't
  1909. // transfer its data
  1910. if (!__dv_is_allocated(dvp)) {
  1911. m_vars[i].direction.bits = c_parameter_nocopy;
  1912. m_vars[i].alloc_if = 0;
  1913. m_vars[i].free_if = 0;
  1914. }
  1915. if (m_vars[i].direction.bits ||
  1916. m_vars[i].alloc_if ||
  1917. m_vars[i].free_if) {
  1918. const Arr_Desc *ap;
  1919. if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
  1920. ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
  1921. // debug dump
  1922. ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
  1923. }
  1924. if (!__dv_is_contiguous(dvp)) {
  1925. m_vars[i].flags.is_noncont_src = 1;
  1926. m_vars_extra[i].read_rng_src =
  1927. init_read_ranges_dv(dvp);
  1928. }
  1929. // size and displacement
  1930. if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
  1931. // offset and length are derived from the
  1932. // array descriptor
  1933. __arr_data_offset_and_length(ap,
  1934. m_vars[i].disp,
  1935. m_vars[i].size);
  1936. if (m_vars[i].direction.bits) {
  1937. if (!is_arr_desc_contiguous(ap)) {
  1938. if (m_vars[i].flags.is_noncont_src) {
  1939. LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
  1940. return false;
  1941. }
  1942. m_vars[i].flags.is_noncont_src = 1;
  1943. m_vars_extra[i].read_rng_src =
  1944. init_read_ranges_arr_desc(ap);
  1945. }
  1946. }
  1947. }
  1948. else {
  1949. if (m_vars[i].flags.has_length) {
  1950. m_vars[i].size =
  1951. __dv_data_length(dvp, m_vars[i].count);
  1952. }
  1953. else {
  1954. m_vars[i].size = __dv_data_length(dvp);
  1955. }
  1956. m_vars[i].disp = 0;
  1957. }
  1958. // check that length >= 0
  1959. if (m_vars[i].alloc_if &&
  1960. (m_vars[i].disp + m_vars[i].size < 0)) {
  1961. LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
  1962. exit(1);
  1963. }
  1964. // base address
  1965. void *base = reinterpret_cast<void*>(dvp->Base);
  1966. PtrData *ptr_data;
  1967. // allocate buffer if we have no INTO and don't need
  1968. // allocation for the ptr at target
  1969. if (src_is_for_mic) {
  1970. if (m_vars[i].alloc_if) {
  1971. // add new entry
  1972. if (!alloc_ptr_data(
  1973. ptr_data,
  1974. reinterpret_cast<char *>(base) + alloc_disp,
  1975. (alloc_base != NULL) ?
  1976. alloc_disp : m_vars[i].disp,
  1977. (alloc_base != NULL) ?
  1978. alloc_size : m_vars[i].size,
  1979. alloc_disp,
  1980. (alloc_base != NULL) ?
  1981. 0 : m_vars[i].align,
  1982. m_vars[i].flags.targetptr,
  1983. m_vars[i].flags.preallocated,
  1984. m_vars[i].flags.pin)) {
  1985. return false;
  1986. }
  1987. if (ptr_data->add_reference() == 0 &&
  1988. ptr_data->mic_buf != 0) {
  1989. // add buffer to the list of buffers
  1990. // that are passed to dispatch call
  1991. m_compute_buffers.push_back(
  1992. ptr_data->mic_buf);
  1993. }
  1994. else {
  1995. // will send buffer address to device
  1996. m_vars[i].flags.sink_addr = 1;
  1997. }
  1998. if (!ptr_data->is_static) {
  1999. // need to add reference for buffer
  2000. m_need_runfunction = true;
  2001. }
  2002. }
  2003. else {
  2004. bool error_if_not_found = true;
  2005. if (m_is_openmp) {
  2006. // For omp target update variable is ignored
  2007. // if it does not exist.
  2008. if (m_vars[i].flags.always_copy ||
  2009. (!m_vars[i].alloc_if &&
  2010. !m_vars[i].free_if)) {
  2011. error_if_not_found = false;
  2012. }
  2013. }
  2014. // use existing association from pointer table
  2015. if (!find_ptr_data(ptr_data,
  2016. base,
  2017. m_vars[i].disp,
  2018. m_vars[i].size,
  2019. m_vars[i].flags.targetptr,
  2020. error_if_not_found)) {
  2021. return false;
  2022. }
  2023. if (m_is_openmp) {
  2024. // make var nocopy if it does not exist
  2025. if (ptr_data == 0) {
  2026. m_vars[i].direction.bits =
  2027. c_parameter_nocopy;
  2028. }
  2029. }
  2030. if (ptr_data != 0) {
  2031. // need to update base in dope vector on device
  2032. m_vars[i].flags.sink_addr = 1;
  2033. }
  2034. }
  2035. if (ptr_data != 0) {
  2036. if (m_is_openmp) {
  2037. // data is transferred if
  2038. // - if always modifier is used OR
  2039. // - if alloc_if == 0 && free_if == 0 OR
  2040. // - if reference count is 1
  2041. if (!m_vars[i].flags.always_copy &&
  2042. (m_vars[i].alloc_if ||
  2043. m_vars[i].free_if) &&
  2044. ptr_data->get_reference() != 1) {
  2045. m_vars[i].direction.bits =
  2046. c_parameter_nocopy;
  2047. }
  2048. }
  2049. if (ptr_data->alloc_disp != 0) {
  2050. m_vars[i].flags.alloc_disp = 1;
  2051. m_in_datalen += sizeof(alloc_disp);
  2052. }
  2053. if (m_vars[i].flags.sink_addr) {
  2054. // get buffers's address on the sink
  2055. if (!init_mic_address(ptr_data)) {
  2056. return false;
  2057. }
  2058. m_in_datalen += sizeof(ptr_data->mic_addr);
  2059. }
  2060. if (!ptr_data->is_static && m_vars[i].free_if) {
  2061. // need to decrement buffer reference on target
  2062. m_need_runfunction = true;
  2063. }
  2064. // offset to base from the beginning of the buffer
  2065. // memory
  2066. m_vars[i].offset =
  2067. (char*) base -
  2068. (char*) ptr_data->cpu_addr.start();
  2069. // copy other pointer properties to var descriptor
  2070. m_vars[i].mic_offset = ptr_data->mic_offset;
  2071. m_vars[i].flags.is_static = ptr_data->is_static;
  2072. }
  2073. }
  2074. else { // !src_is_for_mic
  2075. if (!find_ptr_data(ptr_data,
  2076. base,
  2077. m_vars[i].disp,
  2078. m_vars[i].size,
  2079. false, false)) {
  2080. return false;
  2081. }
  2082. m_vars[i].offset = !ptr_data ? 0 :
  2083. (char*) base -
  2084. (char*) ptr_data->cpu_addr.start();
  2085. }
  2086. // save pointer data
  2087. m_vars_extra[i].src_data = ptr_data;
  2088. }
  2089. break;
  2090. default:
  2091. LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
  2092. LIBOFFLOAD_ABORT;
  2093. }
  2094. if (m_vars_extra[i].type_src == c_data_ptr_array) {
  2095. continue;
  2096. }
  2097. if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
  2098. if (this_threads_cpu_stack_addr == 0) {
  2099. this_threads_cpu_stack_addr =
  2100. get_this_threads_cpu_stack_addr(
  2101. stack_addr, entry_id, thread_specific_function_locals);
  2102. }
  2103. m_vars[i].offset = static_cast<char*>
  2104. (m_vars[i].ptr) -
  2105. this_threads_cpu_stack_addr;
  2106. }
  2107. // if source is used at CPU save its offset and disp
  2108. if (m_vars[i].into == NULL || m_vars[i].direction.in) {
  2109. m_vars_extra[i].cpu_offset = m_vars[i].offset;
  2110. m_vars_extra[i].cpu_disp = m_vars[i].disp;
  2111. }
  2112. // If "into" is define we need to do the similar work for it
  2113. if (!m_vars[i].into) {
  2114. continue;
  2115. }
  2116. int64_t into_disp =0, into_offset = 0;
  2117. switch (m_vars_extra[i].type_dst) {
  2118. case c_data_ptr_array:
  2119. break;
  2120. case c_data:
  2121. case c_void_ptr:
  2122. case c_void_ptr_ptr:
  2123. case c_cean_var: {
  2124. int64_t size = m_vars[i].size;
  2125. if (m_vars[i].flags.is_non_cont_struct && src_is_for_mic) {
  2126. NonContigDesc *desc =
  2127. static_cast<NonContigDesc*>(m_vars[i].into);
  2128. noncont_struct_dump("", "INTO DATA", desc);
  2129. m_vars_extra[i].noncont_desc = desc;
  2130. m_vars[i].into = reinterpret_cast<void*>(desc->base);
  2131. size = get_noncont_struct_size(desc);
  2132. into_disp = 0;
  2133. }
  2134. else if (m_vars_extra[i].type_dst == c_cean_var) {
  2135. // array descriptor
  2136. const Arr_Desc *ap =
  2137. static_cast<const Arr_Desc*>(m_vars[i].into);
  2138. // debug dump
  2139. ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
  2140. // offset and length are derived from the array descriptor
  2141. __arr_data_offset_and_length(ap, into_disp, size);
  2142. if (!is_arr_desc_contiguous(ap)) {
  2143. m_vars[i].flags.is_noncont_dst = 1;
  2144. m_vars_extra[i].read_rng_dst =
  2145. init_read_ranges_arr_desc(ap);
  2146. if (!cean_ranges_match(
  2147. m_vars_extra[i].read_rng_src,
  2148. m_vars_extra[i].read_rng_dst)) {
  2149. LIBOFFLOAD_ERROR(c_ranges_dont_match);
  2150. exit(1);
  2151. }
  2152. }
  2153. m_vars[i].into = reinterpret_cast<void*>(ap->base);
  2154. }
  2155. int64_t size_src = m_vars_extra[i].read_rng_src &&
  2156. !m_vars[i].flags.is_non_cont_struct ?
  2157. cean_get_transf_size(m_vars_extra[i].read_rng_src) :
  2158. m_vars[i].size;
  2159. int64_t size_dst = m_vars_extra[i].read_rng_dst ?
  2160. cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
  2161. size;
  2162. // It's supposed that "into" size must be not less
  2163. // than src size
  2164. if (size_src > size_dst) {
  2165. LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
  2166. size_src, size_dst);
  2167. exit(1);
  2168. }
  2169. if (m_vars[i].direction.bits) {
  2170. if (m_vars[i].flags.is_static_dstn) {
  2171. PtrData *ptr_data;
  2172. // find data associated with variable
  2173. if (!find_ptr_data(ptr_data, m_vars[i].into,
  2174. into_disp, size, false, false)) {
  2175. return false;
  2176. }
  2177. if (ptr_data != 0) {
  2178. // offset to base from the beginning of the buffer
  2179. // memory
  2180. into_offset =
  2181. (char*) m_vars[i].into -
  2182. (char*) ptr_data->cpu_addr.start();
  2183. }
  2184. else {
  2185. m_vars[i].flags.is_static_dstn = false;
  2186. }
  2187. m_vars_extra[i].dst_data = ptr_data;
  2188. }
  2189. }
  2190. if (m_vars[i].direction.in &&
  2191. !m_vars[i].flags.is_static_dstn) {
  2192. m_in_datalen += m_vars[i].size;
  2193. // for non-static target destination defined as CEAN
  2194. // expression we pass to target its size and dist
  2195. if (m_vars_extra[i].type_dst == c_cean_var) {
  2196. m_in_datalen += 2 * sizeof(uint64_t);
  2197. }
  2198. m_need_runfunction = true;
  2199. }
  2200. if (m_is_openmp && src_is_for_mic) {
  2201. if (m_vars[i].flags.is_static_dstn) {
  2202. // Static data is transferred either by omp target
  2203. // update construct which passes zeros for
  2204. // alloc_if and free_if or by always modifier.
  2205. if (!m_vars[i].flags.always_copy &&
  2206. (m_vars[i].alloc_if || m_vars[i].free_if)) {
  2207. m_vars[i].direction.bits = c_parameter_nocopy;
  2208. }
  2209. }
  2210. else {
  2211. AutoData *auto_data;
  2212. if (m_vars[i].alloc_if) {
  2213. auto_data = m_device.insert_auto_data(
  2214. m_vars[i].into, size_dst);
  2215. auto_data->add_reference();
  2216. }
  2217. else {
  2218. // TODO: what should be done if var is not in
  2219. // the table?
  2220. auto_data = m_device.find_auto_data(
  2221. m_vars[i].into);
  2222. }
  2223. // For automatic variables data is transferred:
  2224. // - if always modifier is used OR
  2225. // - if alloc_if == 0 && free_if == 0 OR
  2226. // - if reference count is 1
  2227. if (!m_vars[i].flags.always_copy &&
  2228. (m_vars[i].alloc_if || m_vars[i].free_if) &&
  2229. (auto_data == 0 ||
  2230. auto_data->get_reference() != 1)) {
  2231. m_vars[i].direction.bits = c_parameter_nocopy;
  2232. }
  2233. // save data for later use
  2234. m_vars_extra[i].auto_data = auto_data;
  2235. }
  2236. }
  2237. break;
  2238. }
  2239. case c_dv:
  2240. if (m_vars[i].direction.bits ||
  2241. m_vars[i].alloc_if ||
  2242. m_vars[i].free_if) {
  2243. ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
  2244. // debug dump
  2245. __dv_desc_dump("INTO", dvp);
  2246. // send dope vector contents excluding base
  2247. m_in_datalen += m_vars[i].size - sizeof(uint64_t);
  2248. m_need_runfunction = true;
  2249. }
  2250. break;
  2251. case c_string_ptr:
  2252. case c_data_ptr:
  2253. case c_string_ptr_ptr:
  2254. case c_data_ptr_ptr:
  2255. case c_cean_var_ptr:
  2256. case c_cean_var_ptr_ptr:
  2257. case c_dv_ptr: {
  2258. int64_t size = m_vars[i].size;
  2259. if (m_vars_extra[i].type_dst == c_cean_var_ptr ||
  2260. m_vars_extra[i].type_dst == c_cean_var_ptr_ptr) {
  2261. // array descriptor
  2262. const Arr_Desc *ap =
  2263. static_cast<const Arr_Desc*>(m_vars[i].into);
  2264. // debug dump
  2265. ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic);
  2266. // offset and length are derived from the array descriptor
  2267. __arr_data_offset_and_length(ap, into_disp, size);
  2268. if (!is_arr_desc_contiguous(ap)) {
  2269. m_vars[i].flags.is_noncont_src = 1;
  2270. m_vars_extra[i].read_rng_dst =
  2271. init_read_ranges_arr_desc(ap);
  2272. if (!cean_ranges_match(
  2273. m_vars_extra[i].read_rng_src,
  2274. m_vars_extra[i].read_rng_dst)) {
  2275. LIBOFFLOAD_ERROR(c_ranges_dont_match);
  2276. }
  2277. }
  2278. m_vars[i].into = reinterpret_cast<char**>(ap->base);
  2279. }
  2280. else if (m_vars_extra[i].type_dst == c_dv_ptr) {
  2281. // need to send DV to the device unless it is 'nocopy'
  2282. if (m_vars[i].direction.bits ||
  2283. m_vars[i].alloc_if ||
  2284. m_vars[i].free_if) {
  2285. ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
  2286. // debug dump
  2287. __dv_desc_dump("INTO", dvp);
  2288. m_vars[i].direction.bits = c_parameter_in;
  2289. }
  2290. }
  2291. int64_t size_src = m_vars_extra[i].read_rng_src &&
  2292. !m_vars[i].flags.is_non_cont_struct ?
  2293. cean_get_transf_size(m_vars_extra[i].read_rng_src) :
  2294. m_vars[i].size;
  2295. int64_t size_dst = m_vars_extra[i].read_rng_dst ?
  2296. cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
  2297. size;
  2298. // It's supposed that "into" size must be not less than
  2299. // src size
  2300. if (size_src > size_dst) {
  2301. LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
  2302. size_src, size_dst);
  2303. exit(1);
  2304. }
  2305. if (m_vars[i].direction.bits) {
  2306. PtrData *ptr_data;
  2307. // base address
  2308. void *base = *static_cast<void**>(m_vars[i].into);
  2309. if (m_vars[i].direction.in) {
  2310. // allocate buffer
  2311. if (m_vars[i].flags.is_stack_buf) {
  2312. // for stack persistent objects ptr data is created
  2313. // by var_desc with number 0.
  2314. // Its ptr_data is stored at m_stack_ptr_data
  2315. ptr_data = m_stack_ptr_data;
  2316. }
  2317. else if (m_vars[i].alloc_if) {
  2318. if (m_vars[i].flags.preallocated) {
  2319. m_out_datalen += sizeof(void*);
  2320. m_need_runfunction = true;
  2321. break;
  2322. }
  2323. // add new entry
  2324. if (!alloc_ptr_data(
  2325. ptr_data,
  2326. reinterpret_cast<char *>(base) + alloc_disp,
  2327. (alloc_base != NULL) ?
  2328. alloc_disp : into_disp,
  2329. (alloc_base != NULL) ?
  2330. alloc_size : size,
  2331. alloc_disp,
  2332. (alloc_base != NULL) ?
  2333. 0 : m_vars[i].align,
  2334. m_vars[i].flags.targetptr,
  2335. m_vars[i].flags.preallocated,
  2336. m_vars[i].flags.pin)) {
  2337. return false;
  2338. }
  2339. if (m_vars[i].flags.targetptr) {
  2340. if (!init_mic_address(ptr_data)) {
  2341. return false;
  2342. }
  2343. *static_cast<void**>(m_vars[i].into) = base =
  2344. reinterpret_cast<void*>(ptr_data->mic_addr);
  2345. }
  2346. if (ptr_data->add_reference() == 0 &&
  2347. ptr_data->mic_buf != 0) {
  2348. // add buffer to the list of buffers that
  2349. // are passed to dispatch call
  2350. m_compute_buffers.push_back(
  2351. ptr_data->mic_buf);
  2352. }
  2353. else {
  2354. // will send buffer address to device
  2355. m_vars[i].flags.sink_addr = 1;
  2356. }
  2357. if (!ptr_data->is_static) {
  2358. // need to add reference for buffer
  2359. m_need_runfunction = true;
  2360. }
  2361. }
  2362. else {
  2363. // use existing association from pointer table
  2364. if (!find_ptr_data(ptr_data, base, into_disp,
  2365. size, m_vars[i].flags.targetptr, true)) {
  2366. return false;
  2367. }
  2368. m_vars[i].flags.sink_addr = 1;
  2369. }
  2370. if (ptr_data->alloc_disp != 0) {
  2371. m_vars[i].flags.alloc_disp = 1;
  2372. m_in_datalen += sizeof(alloc_disp);
  2373. }
  2374. if (m_vars[i].flags.sink_addr) {
  2375. // get buffers's address on the sink
  2376. if (!init_mic_address(ptr_data)) {
  2377. return false;
  2378. }
  2379. m_in_datalen += sizeof(ptr_data->mic_addr);
  2380. }
  2381. if (!ptr_data->is_static && m_vars[i].free_if) {
  2382. // need to decrement buffer reference on target
  2383. m_need_runfunction = true;
  2384. }
  2385. // copy other pointer properties to var descriptor
  2386. m_vars[i].mic_offset = ptr_data->mic_offset;
  2387. m_vars[i].flags.is_static_dstn = ptr_data->is_static;
  2388. }
  2389. else {
  2390. if (!find_ptr_data(ptr_data,
  2391. base,
  2392. into_disp,
  2393. m_vars[i].size,
  2394. false, false)) {
  2395. return false;
  2396. }
  2397. }
  2398. if (ptr_data) {
  2399. into_offset = ptr_data ?
  2400. (char*) base -
  2401. (char*) ptr_data->cpu_addr.start() :
  2402. 0;
  2403. }
  2404. if (m_is_openmp) {
  2405. // for FROM transfer of stack buffer's variable
  2406. if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
  2407. AutoData *auto_data;
  2408. char *base = *static_cast<char**>(m_vars[i].into);
  2409. if (m_vars[i].alloc_if) {
  2410. auto_data =m_device.insert_auto_data(
  2411. base + into_disp,
  2412. size);
  2413. auto_data->add_reference();
  2414. }
  2415. else {
  2416. auto_data = m_device.find_auto_data(
  2417. base + into_disp);
  2418. }
  2419. // save data for later use
  2420. m_vars_extra[i].auto_data = auto_data;
  2421. // For automatic variables
  2422. // data is transferred:
  2423. // - if always modifier is used OR
  2424. // - if alloc_if == 0 && free_if == 0 OR
  2425. // - if reference count is 1
  2426. if (!m_vars[i].flags.always_copy &&
  2427. (m_vars[i].alloc_if ||
  2428. m_vars[i].free_if) &&
  2429. auto_data != 0 &&
  2430. auto_data->get_reference() != 1) {
  2431. m_vars[i].direction.bits =
  2432. c_parameter_nocopy;
  2433. }
  2434. }
  2435. }
  2436. // save pointer data
  2437. m_vars_extra[i].dst_data = ptr_data;
  2438. }
  2439. break;
  2440. }
  2441. case c_func_ptr:
  2442. case c_func_ptr_ptr:
  2443. break;
  2444. case c_dv_data:
  2445. case c_dv_ptr_data:
  2446. case c_dv_data_slice:
  2447. case c_dv_ptr_data_slice:
  2448. if (m_vars[i].direction.bits ||
  2449. m_vars[i].alloc_if ||
  2450. m_vars[i].free_if) {
  2451. const Arr_Desc *ap;
  2452. ArrDesc *dvp;
  2453. PtrData *ptr_data;
  2454. int64_t disp;
  2455. int64_t size;
  2456. if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
  2457. ap = static_cast<const Arr_Desc*>(m_vars[i].into);
  2458. // debug dump
  2459. ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
  2460. dvp = (m_vars_extra[i].type_dst == c_dv_data_slice) ?
  2461. reinterpret_cast<ArrDesc*>(ap->base) :
  2462. *reinterpret_cast<ArrDesc**>(ap->base);
  2463. }
  2464. else {
  2465. dvp = (m_vars_extra[i].type_dst == c_dv_data) ?
  2466. static_cast<ArrDesc*>(m_vars[i].into) :
  2467. *static_cast<ArrDesc**>(m_vars[i].into);
  2468. }
  2469. if (!__dv_is_contiguous(dvp)) {
  2470. m_vars[i].flags.is_noncont_dst = 1;
  2471. m_vars_extra[i].read_rng_dst =
  2472. init_read_ranges_dv(dvp);
  2473. }
  2474. // size and displacement
  2475. if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
  2476. // offset and length are derived from the array
  2477. // descriptor
  2478. __arr_data_offset_and_length(ap, into_disp, size);
  2479. if (m_vars[i].direction.bits) {
  2480. if (!is_arr_desc_contiguous(ap)) {
  2481. if (m_vars[i].flags.is_noncont_dst) {
  2482. LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
  2483. return false;
  2484. }
  2485. m_vars[i].flags.is_noncont_dst = 1;
  2486. m_vars_extra[i].read_rng_dst =
  2487. init_read_ranges_arr_desc(ap);
  2488. if (!cean_ranges_match(
  2489. m_vars_extra[i].read_rng_src,
  2490. m_vars_extra[i].read_rng_dst)) {
  2491. LIBOFFLOAD_ERROR(c_ranges_dont_match);
  2492. }
  2493. }
  2494. }
  2495. }
  2496. else {
  2497. if (m_vars[i].flags.has_length) {
  2498. size = __dv_data_length(dvp, m_vars[i].count);
  2499. }
  2500. else {
  2501. size = __dv_data_length(dvp);
  2502. }
  2503. disp = 0;
  2504. }
  2505. int64_t size_src =
  2506. m_vars_extra[i].read_rng_src &&
  2507. (!m_vars[i].flags.is_non_cont_struct ||
  2508. src_is_for_mic) ?
  2509. cean_get_transf_size(m_vars_extra[i].read_rng_src) :
  2510. m_vars[i].size;
  2511. int64_t size_dst =
  2512. m_vars_extra[i].read_rng_dst ?
  2513. cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
  2514. size;
  2515. // It's supposed that "into" size must be not less
  2516. // than src size
  2517. if (size_src > size_dst) {
  2518. LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
  2519. size_src, size_dst);
  2520. exit(1);
  2521. }
  2522. // base address
  2523. void *base = reinterpret_cast<void*>(dvp->Base);
  2524. // allocate buffer
  2525. if (m_vars[i].direction.in) {
  2526. if (m_vars[i].alloc_if) {
  2527. // add new entry
  2528. if (!alloc_ptr_data(
  2529. ptr_data,
  2530. reinterpret_cast<char *>(base) + alloc_disp,
  2531. (alloc_base != NULL) ?
  2532. alloc_disp : into_disp,
  2533. (alloc_base != NULL) ?
  2534. alloc_size : size,
  2535. alloc_disp,
  2536. (alloc_base != NULL) ?
  2537. 0 : m_vars[i].align,
  2538. m_vars[i].flags.targetptr,
  2539. m_vars[i].flags.preallocated,
  2540. m_vars[i].flags.pin)) {
  2541. return false;
  2542. }
  2543. if (ptr_data->add_reference() == 0 &&
  2544. ptr_data->mic_buf !=0) {
  2545. // add buffer to the list of buffers
  2546. // that are passed to dispatch call
  2547. m_compute_buffers.push_back(
  2548. ptr_data->mic_buf);
  2549. }
  2550. else {
  2551. // will send buffer address to device
  2552. m_vars[i].flags.sink_addr = 1;
  2553. }
  2554. if (!ptr_data->is_static) {
  2555. // need to add reference for buffer
  2556. m_need_runfunction = true;
  2557. }
  2558. }
  2559. else {
  2560. // use existing association from pointer table
  2561. if (!find_ptr_data(ptr_data, base, into_disp,
  2562. size, m_vars[i].flags.targetptr, true)) {
  2563. return false;
  2564. }
  2565. // need to update base in dope vector on device
  2566. m_vars[i].flags.sink_addr = 1;
  2567. }
  2568. if (ptr_data->alloc_disp != 0) {
  2569. m_vars[i].flags.alloc_disp = 1;
  2570. m_in_datalen += sizeof(alloc_disp);
  2571. }
  2572. if (m_vars[i].flags.sink_addr) {
  2573. // get buffers's address on the sink
  2574. if (!init_mic_address(ptr_data)) {
  2575. return false;
  2576. }
  2577. m_in_datalen += sizeof(ptr_data->mic_addr);
  2578. }
  2579. if (!ptr_data->is_static && m_vars[i].free_if) {
  2580. // need to decrement buffer reference on target
  2581. m_need_runfunction = true;
  2582. }
  2583. // offset to base from the beginning of the buffer
  2584. // memory
  2585. into_offset =
  2586. (char*) base - (char*) ptr_data->cpu_addr.start();
  2587. // copy other pointer properties to var descriptor
  2588. m_vars[i].mic_offset = ptr_data->mic_offset;
  2589. m_vars[i].flags.is_static_dstn = ptr_data->is_static;
  2590. }
  2591. else { // src_is_for_mic
  2592. if (!find_ptr_data(ptr_data,
  2593. base,
  2594. into_disp,
  2595. size,
  2596. false, false)) {
  2597. return false;
  2598. }
  2599. into_offset = !ptr_data ?
  2600. 0 :
  2601. (char*) base - (char*) ptr_data->cpu_addr.start();
  2602. }
  2603. // save pointer data
  2604. m_vars_extra[i].dst_data = ptr_data;
  2605. }
  2606. break;
  2607. default:
  2608. LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
  2609. LIBOFFLOAD_ABORT;
  2610. }
  2611. // if into is used at CPU save its offset and disp
  2612. if (m_vars[i].direction.out) {
  2613. m_vars_extra[i].cpu_offset = into_offset;
  2614. m_vars_extra[i].cpu_disp = into_disp;
  2615. }
  2616. else {
  2617. if (m_vars[i].flags.is_stack_buf) {
  2618. if (this_threads_cpu_stack_addr == 0) {
  2619. this_threads_cpu_stack_addr =
  2620. get_this_threads_cpu_stack_addr(
  2621. stack_addr, entry_id,
  2622. thread_specific_function_locals);
  2623. }
  2624. into_offset = static_cast<char*>
  2625. (m_vars[i].into) -
  2626. this_threads_cpu_stack_addr;
  2627. }
  2628. m_vars[i].offset = into_offset;
  2629. m_vars[i].disp = into_disp;
  2630. }
  2631. }
  2632. return true;
  2633. }
  2634. bool OffloadDescriptor::setup_misc_data(const char *name)
  2635. {
  2636. OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
  2637. // we can skip run functon call together with wait if offloaded
  2638. // region is empty and there is no user defined non-pointer IN/OUT data
  2639. if (m_need_runfunction) {
  2640. // variable descriptors are sent as input data
  2641. m_in_datalen += m_vars_total * sizeof(VarDesc);
  2642. // timer data is sent as a part of the output data
  2643. m_out_datalen += OFFLOAD_TIMER_DATALEN();
  2644. // max from input data and output data length
  2645. uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
  2646. m_out_datalen;
  2647. // Misc data has the following layout
  2648. // <Function Descriptor>
  2649. // <Function Name>
  2650. // <In/Out Data> (optional)
  2651. //
  2652. // We can transfer copyin/copyout data in misc/return data which can
  2653. // be passed to run function call if its size does not exceed
  2654. // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
  2655. // buffer for it.
  2656. m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
  2657. m_func_desc_size = (m_func_desc_size + 7) & ~7;
  2658. int misc_data_offset = 0;
  2659. int misc_data_size = 0;
  2660. if (data_len > 0) {
  2661. if (m_func_desc_size +
  2662. m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
  2663. m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
  2664. // use misc/return data for copyin/copyout
  2665. misc_data_offset = m_func_desc_size;
  2666. misc_data_size = data_len;
  2667. }
  2668. else {
  2669. OffloadTimer timer_buf(get_timer_data(),
  2670. c_offload_host_alloc_data_buffer);
  2671. // send/receive data using buffer
  2672. COIRESULT res = COI::BufferCreate(data_len,
  2673. COI_BUFFER_OPENCL,
  2674. 0, 0,
  2675. 1, &m_device.get_process(),
  2676. &m_inout_buf);
  2677. if (res != COI_SUCCESS) {
  2678. if (m_status != 0) {
  2679. m_status->result = translate_coi_error(res);
  2680. return false;
  2681. }
  2682. report_coi_error(c_buf_create, res);
  2683. }
  2684. m_compute_buffers.push_back(m_inout_buf);
  2685. m_destroy_buffers.push_back(m_inout_buf);
  2686. }
  2687. }
  2688. // initialize function descriptor
  2689. m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
  2690. misc_data_size);
  2691. if (m_func_desc == NULL)
  2692. LIBOFFLOAD_ERROR(c_malloc);
  2693. m_func_desc->console_enabled = console_enabled;
  2694. m_func_desc->timer_enabled = offload_report_enabled &&
  2695. (timer_enabled || offload_report_level);
  2696. m_func_desc->offload_report_level = offload_report_enabled ?
  2697. offload_report_level : 0;
  2698. m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
  2699. m_func_desc->in_datalen = m_in_datalen;
  2700. m_func_desc->out_datalen = m_out_datalen;
  2701. m_func_desc->vars_num = m_vars_total;
  2702. m_func_desc->data_offset = misc_data_offset;
  2703. // append entry name
  2704. strcpy(m_func_desc->data, name);
  2705. }
  2706. return true;
  2707. }
  2708. void OffloadDescriptor::setup_omp_async_info()
  2709. {
  2710. OFFLOAD_TRACE(2, "setup_omp_async_info\n");
  2711. OmpAsyncLastEventType event_type = m_need_runfunction ?
  2712. c_last_runfunc : c_last_write;
  2713. int last_in = m_need_runfunction ? 0 : -1;
  2714. int i;
  2715. for (i = m_vars_total - 1; i >=0; i--) {
  2716. bool src_is_target = (m_vars[i].direction.out || !m_vars[i].into);
  2717. int var_type = src_is_target ? m_vars_extra[i].type_src :
  2718. m_vars_extra[i].type_dst;
  2719. bool target_is_static = src_is_target ? m_vars[i].flags.is_static :
  2720. m_vars[i].flags.is_static_dstn;
  2721. switch (var_type) {
  2722. case c_data:
  2723. case c_void_ptr:
  2724. case c_cean_var:
  2725. if (m_vars[i].direction.out && target_is_static) {
  2726. event_type = c_last_read;
  2727. }
  2728. else if (last_in < 0 && m_vars[i].direction.in &&
  2729. target_is_static) {
  2730. last_in = i;
  2731. }
  2732. break;
  2733. case c_string_ptr:
  2734. case c_data_ptr:
  2735. case c_string_ptr_ptr:
  2736. case c_data_ptr_ptr:
  2737. case c_cean_var_ptr:
  2738. case c_cean_var_ptr_ptr:
  2739. case c_dv_ptr:
  2740. case c_dv_data:
  2741. case c_dv_ptr_data:
  2742. case c_dv_data_slice:
  2743. case c_dv_ptr_data_slice:
  2744. if (m_vars[i].direction.out) {
  2745. event_type = c_last_read;
  2746. }
  2747. else if (last_in < 0 && m_vars[i].direction.in) {
  2748. last_in = i;
  2749. }
  2750. break;
  2751. default:
  2752. break;
  2753. }
  2754. if (event_type == c_last_read) {
  2755. break;
  2756. }
  2757. }
  2758. if (event_type == c_last_read) {
  2759. m_vars_extra[i].omp_last_event_type = c_last_read;
  2760. }
  2761. else if (event_type == c_last_write) {
  2762. m_vars_extra[last_in].omp_last_event_type = c_last_write;
  2763. }
  2764. m_omp_async_last_event_type = event_type;
  2765. OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
  2766. m_omp_async_last_event_type);
  2767. }
  2768. extern "C" {
  2769. void offload_proxy_task_completed_ooo(
  2770. COIEVENT e,
  2771. const COIRESULT r,
  2772. const void *info
  2773. )
  2774. {
  2775. task_completion_callback ((void *) info);
  2776. }
  2777. // Callback function for asynchronous offloads
  2778. void offload_complete_task(
  2779. COIEVENT e,
  2780. const COIRESULT r,
  2781. const void *info
  2782. )
  2783. {
  2784. Stream *stream;
  2785. OffloadDescriptor *task = const_cast<OffloadDescriptor*>(
  2786. reinterpret_cast<const OffloadDescriptor*>(info));
  2787. uint32_t events_remained;
  2788. lock_complete.lock();
  2789. if (!offload_descr_map[task]) {
  2790. lock_complete.unlock();
  2791. return;
  2792. }
  2793. #ifndef TARGET_WINNT
  2794. events_remained = __sync_sub_and_fetch(&task->m_event_count, 1);
  2795. #else // TARGET_WINNT
  2796. events_remained = _InterlockedDecrement(&task->m_event_count);
  2797. #endif // TARGET_WINNT
  2798. // Waiting for the last event
  2799. if (events_remained != 0) {
  2800. lock_complete.unlock();
  2801. return;
  2802. }
  2803. // Callback could be called when execution at host is completed.
  2804. // Do nothing as engine data is destructed
  2805. if (!task->get_device().get_ready()) {
  2806. lock_complete.unlock();
  2807. return;
  2808. }
  2809. void * signal = task->get_signal();
  2810. _Offload_stream stream_handle = task->get_stream();
  2811. OFFLOAD_TRACE(2, "Call function offload_complete_task(%p)\n", info);
  2812. // Completed offload has a signal
  2813. if (task->m_has_signal) {
  2814. if (!offload_descr_map[task]) {
  2815. lock_complete.unlock();
  2816. return;
  2817. }
  2818. task->get_device().complete_signaled_ofld(signal);
  2819. // Asynchronous offload can have both signal and stream. Need to
  2820. // clean stream if any.
  2821. stream_handle = task->get_stream();
  2822. if (stream_handle != -1) {
  2823. stream = Stream::find_stream(stream_handle, false);
  2824. if (stream && stream->get_last_offload() == task) {
  2825. stream->set_last_offload(NULL);
  2826. }
  2827. }
  2828. offload_descr_map[task] = false;
  2829. lock_complete.unlock();
  2830. if (task->offload_finish(0)) { //arg is 0 for is_traceback
  2831. task->cleanup();
  2832. }
  2833. delete task;
  2834. }
  2835. // Asynchronous by stream
  2836. else {
  2837. if (stream_handle != 0) {
  2838. stream = Stream::find_stream(stream_handle, false);
  2839. // the stream was not created or was destroyed
  2840. if (!stream) {
  2841. LIBOFFLOAD_ERROR(c_offload_no_stream,
  2842. task->get_device().get_logical_index());
  2843. LIBOFFLOAD_ABORT;
  2844. }
  2845. if (!offload_descr_map[task]) {
  2846. lock_complete.unlock();
  2847. return;
  2848. }
  2849. if (task == stream->get_last_offload()) {
  2850. stream->set_last_offload(NULL);
  2851. }
  2852. // if the offload has both signal and stream we will complete
  2853. // it as it has the signal. So we don't need to mark signal
  2854. // as completed.
  2855. offload_descr_map[task] = false;
  2856. lock_complete.unlock();
  2857. if (task->offload_finish(0)) { //arg is 0 for is_traceback
  2858. task->cleanup();
  2859. }
  2860. delete task;
  2861. }
  2862. }
  2863. }
  2864. }
  2865. void OffloadDescriptor::register_omp_event_call_back(
  2866. const COIEVENT *event,
  2867. const void *info)
  2868. {
  2869. register_event_call_back(&offload_proxy_task_completed_ooo, event, info);
  2870. }
  2871. void OffloadDescriptor::register_event_call_back(
  2872. void (*func)(COIEVENT, const COIRESULT, const void*),
  2873. const COIEVENT *event,
  2874. const void *info)
  2875. {
  2876. OFFLOAD_TRACE(2, "register_event_call_back(event=%p, info=%p)\n",
  2877. event, info);
  2878. if (COI::EventRegisterCallback) {
  2879. COI::EventRegisterCallback(
  2880. *event,
  2881. func,
  2882. info, 0);
  2883. OFFLOAD_TRACE(2,
  2884. "COI::EventRegisterCallback found; callback registered\n");
  2885. }
  2886. }
// Handle the wait(...) / stream dependencies of an offload before it runs.
//
// waits     - array of signal values to wait for (when num_waits > 0)
// num_waits - number of entries in waits;
//             0  means no explicit waits: just pick up the in-dependencies
//                of this offload's stream;
//             -1 means "wait for stream(s)" (offload_wait stream(...)).
// handle    - stream handle for the num_waits == -1 case; 0 selects all
//             streams of the device (or of all devices when
//             m_wait_all_devices is set).
//
// For asynchronous offloads the waited events are not actually waited on;
// they are collected into m_p_in_dependencies for later COI calls.
// Returns false if any completed offload reported failure.
bool OffloadDescriptor::wait_dependencies(
    const void **waits,
    int num_waits,
    _Offload_stream handle
)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
    bool ret = true;
    OffloadDescriptor *task;
    void * signal;

    if (num_waits == 0) {
        // Prepare in dependencies for stream
        get_stream_in_dependencies(m_num_in_dependencies,m_p_in_dependencies);
        return true;
    }

    // wait for streams
    if (num_waits == -1) {
        Stream * stream;
        // some specific stream of the device
        if (handle != 0) {
            // lock_complete serializes this path against the
            // offload_complete_task callback completing the same task.
            lock_complete.lock();
            stream = Stream::find_stream(handle, false);

            // the stream was not created or was destroyed
            if (!stream) {
                LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
                LIBOFFLOAD_ABORT;
            }

            task = stream->get_last_offload();

            // offload was completed by previous offload_wait pragma
            // or wait clause.
            // Note: if the stream had no last offload, task is NULL and
            // the map lookup yields false, so we return here as well.
            if (!offload_descr_map[task]) {
                lock_complete.unlock();
                return true;
            }
            stream->set_last_offload(NULL);

            // If the task also carries a signal, mark that signal as
            // completed so later waits on it do not find a stale task.
            if (task->m_has_signal) {
                signal = task->get_signal();
                if (m_device.find_signal(signal, false) == task) {
                    m_device.complete_signaled_ofld(signal);
                }
            }
            // Mark the task completed before dropping the lock to avoid
            // a double completion by the callback.
            offload_descr_map[task] = false;
            lock_complete.unlock();

            if (!task->offload_finish(0)) { //arg is 0 for is_traceback
                ret = false;
            }
            task->cleanup();
            delete task;
        }
        // all streams of the device or over all devices
        else {
            // Iterate over a snapshot copy of the global stream map.
            StreamMap stream_map = Stream::all_streams;
            for (StreamMap::iterator it = stream_map.begin();
                it != stream_map.end(); it++) {
                Stream * stream = it->second;

                // Skip streams of other devices unless waiting on all.
                if (!m_wait_all_devices &&
                    stream->get_device() != m_device.get_logical_index()) {
                    continue;
                }
                lock_complete.lock();

                // get associated async task
                OffloadDescriptor *task = stream->get_last_offload();

                // offload was completed by offload_wait pragma or wait clause
                if (!offload_descr_map[task]) {
                    lock_complete.unlock();
                    continue;
                }
                if (task->m_has_signal) {
                    signal = task->get_signal();
                    if (task->get_device().find_signal(signal, false) ==
                        task) {
                        task->get_device().complete_signaled_ofld(signal);
                    }
                }
                stream->set_last_offload(NULL);
                offload_descr_map[task] = false;
                lock_complete.unlock();

                if (!task->offload_finish(0)) { //arg is 0 for is_traceback
                    ret = false;
                }
                task->cleanup();
                delete task;
            }
            // no uncompleted streams
            return true;
        }
    }
    else {
        // If offload is asynchronous we will not really wait for signals.
        // We will collect all waited events into m_p_in_dependencies vector
        // to be used in future calls to COI::Copy... API.
        if (!__offload_always_wait && (m_has_signal || (get_stream() > 0))) {
            uint64_t num_in_dep = 0,
                     num_in_dep_prev = 0;
            COIEVENT *p_in_dep = NULL;
            _Offload_stream stream_handle = get_stream();
            Stream *stream;
            bool stream_need_connection = stream_handle > 0;

            if (stream_need_connection) {
                stream = Stream::find_stream(stream_handle, false);
                // check previous offload with the stream_handle
                // to be noncompleted
                if (!stream) {
                    stream_need_connection = false;
                }
            }
            for (int i = 0; i < num_waits; i++) {
                task = m_device.find_signal(waits[i], false);
                if (task == 0) {
                    // Signal was never initiated on this device.
                    LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
                                     waits[i]);
                    LIBOFFLOAD_ABORT;
                }
                else if (task == SIGNAL_HAS_COMPLETED) {
                    continue;
                }
                // The stream's last offload is already among the waited
                // tasks; no extra connection to the stream is needed.
                if (stream_need_connection &&
                    stream->get_last_offload() == task) {
                    stream_need_connection = false;
                }
                if (!task->m_num_in_dependencies) {
                    continue;
                }
                // Append the task's in-dependency events to our vector.
                num_in_dep += task->m_num_in_dependencies;
                p_in_dep = (COIEVENT*)realloc(p_in_dep,
                                              sizeof(COIEVENT) * num_in_dep);
                if (p_in_dep == NULL)
                    LIBOFFLOAD_ERROR(c_malloc);
                memcpy(p_in_dep + num_in_dep_prev, task->m_p_in_dependencies,
                       task->m_num_in_dependencies * sizeof(COIEVENT));
                num_in_dep_prev = num_in_dep;
            }
            // Also chain behind the stream's last offload if it was not
            // already covered by the explicit waits above.
            if (stream_need_connection) {
                task = stream->get_last_offload();
                if (task) {
                    num_in_dep += task->m_num_in_dependencies;
                    p_in_dep = (COIEVENT*)realloc(p_in_dep,
                                                  sizeof(COIEVENT) * num_in_dep);
                    if (p_in_dep == NULL)
                        LIBOFFLOAD_ERROR(c_malloc);
                    memcpy(p_in_dep + num_in_dep_prev,
                           task->m_p_in_dependencies,
                           task->m_num_in_dependencies * sizeof(COIEVENT));
                    num_in_dep_prev = num_in_dep;
                }
            }
            // Only replace our dependency vector if anything was collected.
            m_num_in_dependencies = num_in_dep ? num_in_dep :
                                    m_num_in_dependencies;
            m_p_in_dependencies = num_in_dep ? p_in_dep : m_p_in_dependencies;
        }
        // wait and do offload_finish for serial offload
        else {
            for (int i = 0; i < num_waits; i++) {
                _Offload_stream stream_handle;
                Stream *stream;

                lock_complete.lock();

                task = m_device.find_signal(waits[i], false);
                if (task == 0) {
                    LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
                                     waits[i]);
                    LIBOFFLOAD_ABORT;
                }
                else if (!offload_descr_map[task]) {
                    // Already completed elsewhere.
                    lock_complete.unlock();
                    continue;
                }
                // Need to mark signal as completed to prevent race condition
                // with the call to "offload_complete_task" for the same
                // signal.
                m_device.complete_signaled_ofld(waits[i]);

                // Asynchronous offload can have both signal and stream.
                // Need to clean stream if any.
                stream_handle = task->m_stream;
                if (stream_handle != -1) {
                    stream = Stream::find_stream(stream_handle, false);
                    if (stream && stream->get_last_offload() == task) {
                        stream->set_last_offload(NULL);
                    }
                }
                offload_descr_map[task] = false;
                lock_complete.unlock();

                if (!task->offload_finish(0)) { //arg is 0 for is_traceback
                    ret = false;
                }
                task->cleanup();
                delete task;
            }
        }
    }
    return ret;
}
  3078. bool OffloadDescriptor::offload_wrap(
  3079. const char *name,
  3080. bool is_empty,
  3081. VarDesc *vars,
  3082. VarDesc2 *vars2,
  3083. int vars_total,
  3084. const void **waits,
  3085. int num_waits,
  3086. const void **signal,
  3087. int entry_id,
  3088. const void *stack_addr,
  3089. OffloadFlags offload_flags
  3090. )
  3091. {
  3092. OffloadWaitKind wait_kind = c_offload_wait_signal;
  3093. bool is_traceback = offload_flags.bits.fortran_traceback;
  3094. // define kind of wait if any;
  3095. // there can be one of the following kind:
  3096. // 1. c_offload_wait_signal for "offload_wait wait(signal)"
  3097. // 2. c_offload_wait_stream for "offload_wait stream(stream)"
  3098. // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
  3099. if (num_waits == -1) {
  3100. wait_kind = (m_stream == 0) ?
  3101. c_offload_wait_all_streams :
  3102. c_offload_wait_stream;
  3103. }
  3104. char buf[35];
  3105. const char *stream_str;
  3106. if (m_stream == no_stream || num_waits ==-1) {
  3107. stream_str = "none";
  3108. }
  3109. else if (m_stream == 0) {
  3110. stream_str = "all";
  3111. }
  3112. else {
  3113. sprintf(buf, "%#llx", m_stream);
  3114. stream_str = buf;
  3115. }
  3116. if (m_has_signal) {
  3117. OFFLOAD_DEBUG_TRACE_1(1,
  3118. GET_OFFLOAD_NUMBER(get_timer_data()),
  3119. c_offload_init_func,
  3120. "Offload function %s, is_empty=%d, #varDescs=%d, "
  3121. "signal=none, stream=%s, #waits=%d%c",
  3122. name, is_empty, vars_total, stream_str, num_waits,
  3123. num_waits == 0 ? '\n' : ' ');
  3124. // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
  3125. // since the number of waits is not fixed.
  3126. if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
  3127. if (num_waits) {
  3128. printf("(");
  3129. if (m_stream == no_stream) {
  3130. printf("%p", waits[0]);
  3131. for (int i = 1; i < num_waits; i++) {
  3132. printf(", %p", waits[i]);
  3133. }
  3134. }
  3135. else if (m_stream != 0) {
  3136. printf("%#x", m_stream);
  3137. }
  3138. else {
  3139. printf(" all streams");
  3140. }
  3141. printf(")");
  3142. }
  3143. printf("\n");
  3144. fflush(NULL);
  3145. }
  3146. // stream in wait is reported further in OFFLOAD_REPORT for waits
  3147. if (m_stream != no_stream && num_waits == 0) {
  3148. OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
  3149. c_offload_stream,
  3150. "%d\n", m_stream);
  3151. }
  3152. OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
  3153. c_offload_signal,
  3154. "none %d\n", 0);
  3155. }
  3156. else {
  3157. OFFLOAD_DEBUG_TRACE_1(1,
  3158. GET_OFFLOAD_NUMBER(get_timer_data()),
  3159. c_offload_init_func,
  3160. "Offload function %s, is_empty=%d, #varDescs=%d, "
  3161. "signal=%p, stream=%s, #waits=%d%c",
  3162. name, is_empty, vars_total, signal, stream_str,
  3163. num_waits, num_waits == 0 ? '\n' : ' ');
  3164. // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
  3165. // since the number of waits is not fixed.
  3166. if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
  3167. if (num_waits) {
  3168. printf("(");
  3169. if (m_stream == no_stream) {
  3170. printf("%p", waits[0]);
  3171. for (int i = 1; i < num_waits; i++) {
  3172. printf(", %p", waits[i]);
  3173. }
  3174. printf(")");
  3175. }
  3176. else if (m_stream != 0) {
  3177. printf("%#x", m_stream);
  3178. }
  3179. else {
  3180. printf(" all streams");
  3181. }
  3182. printf(")");
  3183. }
  3184. printf("\n");
  3185. fflush(NULL);
  3186. }
  3187. // stream in wait is reported further in OFFLOAD_REPORT for waits
  3188. if (m_stream != no_stream && num_waits == 0) {
  3189. OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
  3190. c_offload_stream,
  3191. "%d\n", m_stream);
  3192. }
  3193. OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
  3194. c_offload_signal,
  3195. "%d\n", signal);
  3196. }
  3197. if (console_enabled >= 1 && offload_flags.flags != 0) {
  3198. trace_offload_flags(get_timer_data(), offload_flags);
  3199. }
  3200. OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
  3201. c_offload_wait, "%d\n",
  3202. wait_kind, num_waits,
  3203. (wait_kind == c_offload_wait_signal) ?
  3204. waits :
  3205. reinterpret_cast<const void **>(m_stream));
  3206. if (m_status != 0) {
  3207. m_status->result = OFFLOAD_SUCCESS;
  3208. m_status->device_number = m_device.get_logical_index();
  3209. }
  3210. m_initial_need_runfunction = m_need_runfunction = !is_empty;
  3211. // wait for dependencies to finish or set
  3212. // m_num_in_dependencies and m_p_in_dependencies for asynchronous offload
  3213. if (!wait_dependencies(waits, num_waits, m_stream)) {
  3214. cleanup();
  3215. return false;
  3216. }
  3217. // setup buffers
  3218. if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
  3219. cleanup();
  3220. return false;
  3221. }
  3222. if (offload_flags.bits.omp_async) {
  3223. setup_omp_async_info();
  3224. }
  3225. // initiate send for pointers. Want to do it as early as possible.
  3226. if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
  3227. signal)) {
  3228. cleanup();
  3229. return false;
  3230. }
  3231. // setup misc data for run function
  3232. if (!setup_misc_data(name)) {
  3233. cleanup();
  3234. return false;
  3235. }
  3236. // gather copyin data into buffer
  3237. if (!gather_copyin_data()) {
  3238. cleanup();
  3239. return false;
  3240. }
  3241. // Start the computation
  3242. if (!compute(signal)) {
  3243. cleanup();
  3244. return false;
  3245. }
  3246. // initiate receive for pointers
  3247. if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
  3248. true, signal)) {
  3249. cleanup();
  3250. return false;
  3251. }
  3252. if (offload_flags.bits.omp_async) {
  3253. return true;
  3254. }
  3255. // if there is a signal or stream save descriptor for the later use.
  3256. // num_waits == -1 is for offload_wait and there is nothing to save
  3257. if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
  3258. if (signal != 0) {
  3259. m_device.add_signal(*signal, this);
  3260. }
  3261. if (m_stream != no_stream && m_stream != 0) {
  3262. Stream* stream = Stream::find_stream(m_stream, false);
  3263. if (stream) {
  3264. stream->set_last_offload(this);
  3265. }
  3266. else {
  3267. LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
  3268. LIBOFFLOAD_ABORT;
  3269. }
  3270. }
  3271. // Register callback function "offload_complete_task" for all out
  3272. // events or for all in events if there are no out transfers
  3273. if (!m_preallocated_alloc) {
  3274. m_event_count = m_out_deps_total ?
  3275. m_out_deps_total : m_in_deps_total;
  3276. COIEVENT *event_list = m_out_deps_total ? m_out_deps : m_in_deps;
  3277. for (int i = 0; i < m_event_count; i++) {
  3278. register_event_call_back(&offload_complete_task,
  3279. &event_list[i], this);
  3280. }
  3281. offload_descr_map[this] = true;
  3282. return true;
  3283. }
  3284. }
  3285. // wait for the offload to finish.
  3286. if (!offload_finish(is_traceback)) {
  3287. cleanup();
  3288. return false;
  3289. }
  3290. cleanup();
  3291. return true;
  3292. }
  3293. bool OffloadDescriptor::offload(
  3294. const char *name,
  3295. bool is_empty,
  3296. VarDesc *vars,
  3297. VarDesc2 *vars2,
  3298. int vars_total,
  3299. const void **waits,
  3300. int num_waits,
  3301. const void **signal,
  3302. int entry_id,
  3303. const void *stack_addr,
  3304. OffloadFlags offload_flags
  3305. )
  3306. {
  3307. bool res;
  3308. res = offload_wrap(name, is_empty, vars, vars2, vars_total,
  3309. waits, num_waits, signal, entry_id,
  3310. stack_addr, offload_flags);
  3311. if (res == false && !m_traceback_called) {
  3312. if (offload_flags.bits.fortran_traceback) {
  3313. OFFLOAD_TRACE(3,
  3314. "Calling Fortran library to continue traceback from MIC\n");
  3315. FORTRAN_TRACE_BACK(m_status->result);
  3316. m_traceback_called = true;
  3317. }
  3318. }
  3319. return res;
  3320. }
// Complete an offload: wait for the compute (in) and receive (out)
// dependency events, scatter copyout data back to host variables, and
// destroy temporary buffers.
//
// is_traceback - when true, a failing wait starts the Fortran traceback
//                (offload issued from Fortran with traceback enabled).
//
// Returns false on COI wait/destroy failure or copyout failure.
bool OffloadDescriptor::offload_finish(
    bool is_traceback
)
{
    COIRESULT res;

    // wait for compute dependencies to become signaled.
    // With preallocated buffers the in-dependencies must be waited even
    // if out-dependencies exist (see the copyout comment below).
    if (m_in_deps_total > 0 &&
        (m_out_deps_total <= 0 || m_preallocated_alloc)) {
        OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);

        if (__offload_active_wait) {
            // keep CPU busy
            do {
                res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
            }
            while (res == COI_TIME_OUT_REACHED);
        }
        else {
            // Block until all in-dependency events are signaled.
            res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
        }

        if (res != COI_SUCCESS) {
            // Report the failure through m_status if available (starting
            // a traceback at most once), otherwise abort via
            // report_coi_error / exit.
            if (m_status != 0 && !m_traceback_called) {
                m_status->result = translate_coi_error(res);
                if (is_traceback) {
                    OFFLOAD_TRACE(3,
                        "Calling Fortran library to continue traceback from MIC\n");
                    FORTRAN_TRACE_BACK(m_status->result);
                    m_traceback_called = true;
                }
                return false;
            }

            if (is_traceback && !m_traceback_called) {
                OFFLOAD_TRACE(3,
                    "Calling Fortran library to continue traceback from MIC\n");
                FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
                // No status object to report through - terminate.
                exit(1);
            }

            report_coi_error(c_event_wait, res);
        }
    }

    // need to do scatter copyout data received from target after
    // completing in dependencies to get preallocated buffers.
    // If there are no preallocated buffers we will scatter_copyout_data
    // after completing out dependencies. In this case we dont need wait
    // in dependencies as they are already in DAG.
    if (m_out_with_preallocated) {
        if (!scatter_copyout_data()) {
            return false;
        }
        // Second receive pass now that preallocated targets are known.
        if (!receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
            cleanup();
            return false;
        }
    }

    // wait for receive dependencies to become signaled
    if (m_out_deps_total > 0) {
        OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);

        if (__offload_active_wait) {
            // keep CPU busy
            do {
                res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
            }
            while (res == COI_TIME_OUT_REACHED);
        }
        else {
            res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
        }

        if (res != COI_SUCCESS) {
            // Same error/traceback handling as for the compute wait above.
            if (m_status != 0 && !m_traceback_called) {
                m_status->result = translate_coi_error(res);
                if (is_traceback) {
                    OFFLOAD_TRACE(3,
                        "Calling Fortran library to continue traceback from MIC\n");
                    FORTRAN_TRACE_BACK(m_status->result);
                    m_traceback_called = true;
                }
                return false;
            }

            if (is_traceback && !m_traceback_called) {
                OFFLOAD_TRACE(3,
                    "Calling Fortran library to continue traceback from MIC\n");
                FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
                exit(1);
            }

            report_coi_error(c_event_wait, res);
        }
    }

    // Copyout was not done above - do it now that out deps are complete.
    if (!m_out_with_preallocated && !scatter_copyout_data()) {
        return false;
    }

    // destroy buffers
    {
        OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);

        for (BufferList::const_iterator it = m_destroy_buffers.begin();
             it != m_destroy_buffers.end(); it++) {
            res = COI::BufferDestroy(*it);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_destroy, res);
            }
        }
    }

    return true;
}
  3427. void OffloadDescriptor::cleanup()
  3428. {
  3429. // release device in orsl
  3430. ORSL::release(m_device.get_logical_index());
  3431. OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
  3432. // report stuff
  3433. Offload_Report_Epilog(get_timer_data());
  3434. }
  3435. bool OffloadDescriptor::is_signaled()
  3436. {
  3437. bool signaled = true;
  3438. COIRESULT res;
  3439. // check compute and receive dependencies
  3440. if (m_out_deps_total > 0) {
  3441. res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
  3442. signaled = signaled && (res == COI_SUCCESS);
  3443. }
  3444. else if (m_in_deps_total > 0) {
  3445. res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
  3446. signaled = signaled && (res == COI_SUCCESS);
  3447. }
  3448. return signaled;
  3449. }
  3450. static Arr_Desc * make_arr_desc(
  3451. void* ptr_val,
  3452. int64_t extent_start_val,
  3453. int64_t extent_elements_val,
  3454. int64_t size
  3455. )
  3456. {
  3457. Arr_Desc *res;
  3458. res = (Arr_Desc *)malloc(sizeof(Arr_Desc));
  3459. if (res == NULL)
  3460. LIBOFFLOAD_ERROR(c_malloc);
  3461. res->base = reinterpret_cast<int64_t>(ptr_val);
  3462. res->rank = 1;
  3463. res->dim[0].size = size;
  3464. res->dim[0].lindex = 0;
  3465. res->dim[0].lower = extent_start_val;
  3466. res->dim[0].upper = extent_elements_val + extent_start_val - 1;
  3467. res->dim[0].stride = 1;
  3468. return res;
  3469. }
// Send pointer data if source or destination or both of them are
// noncontiguous. There is guarantee that length of destination enough for
// transferred data.
//
// i               - index of the variable descriptor being transferred
// src_data        - host-side buffer info (may be 0: plain host memory)
// dst_data        - target-side buffer info
// event           - out event for async transfer, or 0 for synchronous;
//                   when set it is multiplied into m_in_deps, one event
//                   per contiguous range
// data_sent       - out: total bytes transferred
// in_deps_amount/in_deps - NOTE(review): unused in this body; transfers
//                   use m_num_in_dependencies / m_p_in_dependencies instead.
//
// Returns false on COI failure (with m_status set when available).
bool OffloadDescriptor::send_noncontiguous_pointer_data(
    int i,
    PtrData* src_data,
    PtrData* dst_data,
    COIEVENT *event,
    uint64_t &data_sent,
    uint32_t in_deps_amount,
    COIEVENT *in_deps
)
{
    NonContigDesc *desc;
    int noncont_num;
    int64_t offset_src, offset_dst;
    int64_t length_src, length_dst;
    int64_t length_src_cur, length_dst_cur;
    int64_t send_size;
    COIRESULT res;
    bool dst_is_empty = true;
    bool src_is_empty = true;

    // If BufferWriteMultiD is defined we can set values of required arguments
    // and transfer noncontiguous data via call to the COI routine.
    if (!m_vars[i].flags.is_non_cont_struct &&
        __offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
        struct Arr_Desc* arr_desc_dst;
        struct Arr_Desc* arr_desc_src;
        int64_t size_src, size_dst;
        char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
            m_vars_extra[i].type_src);
        COIBUFFER dst_buf = m_vars[i].into ?
            m_vars_extra[i].dst_data->mic_buf :
            m_vars_extra[i].src_data->mic_buf;

        // Source offset/size: from the read-range descriptor if the
        // source is ranged, otherwise the scalar displacement/size.
        offset_src = (m_vars_extra[i].read_rng_src)?
            m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp;
        size_src = m_vars_extra[i].read_rng_src ?
            cean_get_transf_size(m_vars_extra[i].read_rng_src) :
            m_vars[i].size;

        offset_dst = (m_vars_extra[i].read_rng_dst)?
            m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp;
        size_dst = m_vars_extra[i].read_rng_dst ?
            cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;

        // Element size for synthesized descriptors: taken from the last
        // dimension of whichever side has a range descriptor; 1 when both
        // sides are ranged (or there is no "into").
        int64_t el_size = (!m_vars[i].into ||
            (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ?
            1 :
            m_vars_extra[i].read_rng_src ?
            m_vars_extra[i].read_rng_src->arr_desc->dim[
                m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
            m_vars_extra[i].read_rng_dst->arr_desc->dim[
                m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;

        arr_desc_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->arr_desc :
            make_arr_desc(NULL, // not required for source
                offset_src/el_size, size_src/el_size, el_size);
        arr_desc_dst = !m_vars[i].into ?
            arr_desc_src :
            (m_vars_extra[i].read_rng_dst) ?
            m_vars_extra[i].read_rng_dst->arr_desc :
            make_arr_desc(NULL,
                offset_dst/el_size, size_src/el_size, el_size);

        int64_t alloc_disp = m_vars[i].into ?
            m_vars_extra[i].dst_data->alloc_disp :
            m_vars_extra[i].src_data->alloc_disp;

        // Destination addresses are buffer-relative; source is host memory.
        arr_desc_dst->base = 0;
        arr_desc_src->base = reinterpret_cast<int64_t>(base);

        res = COI::BufferWriteMultiD(
            dst_buf,                // in_DestBuffer,
            NULL,                   // DestProcess,
            m_vars[i].offset + m_vars[i].mic_offset -
            alloc_disp,             // Offset
            (void*)arr_desc_dst,    // descriptor of DestArray
            (void*)arr_desc_src,    // descriptor of SrcArray
            COI_COPY_UNSPECIFIED,   // Type
            m_num_in_dependencies,  // Number of in Dependencies
            m_p_in_dependencies,    // array of in Dependencies
            event);                 // out Dependency
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_copy, res);
        }
        return(true);
    }

    // Fallback: walk the contiguous sub-ranges manually.
    data_sent = 0;
    if (m_vars[i].flags.is_non_cont_struct) {
        // Struct with noncontiguous members: intervals are precomputed.
        desc = m_vars_extra[i].noncont_desc;
        noncont_num = 0;
    }
    else {
        // Set length_src and length_dst
        length_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
        length_dst = !m_vars[i].into ? length_src :
                     (m_vars_extra[i].read_rng_dst) ?
                     m_vars_extra[i].read_rng_dst->range_size :
                     m_vars[i].size;
        // Each chunk is the smaller of the two contiguous range sizes.
        send_size = (length_src < length_dst) ? length_src : length_dst;
    }

    // if event is defined we must multiply it for all contiguous ranges
    // that will be Copied/Write.
    // Take in account that we already have 1 event.
    if (event) {
        uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
            desc->interval_cnt :
            (length_src / send_size) *
            ((m_vars_extra[i].read_rng_src) ?
             m_vars_extra[i].read_rng_src->range_max_number : 1) ;
        m_in_deps_allocated += range_num ;
        m_in_deps =
            (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
        // The caller's slot is reused by the first iteration below.
        m_in_deps_total--;
    }

    // consequently get contiguous ranges,
    // define corresponded destination offset and send data
    do {
        if (m_vars[i].flags.is_non_cont_struct) {
            // ranges are over
            if (noncont_num >= desc->interval_cnt) {
                break;
            }
            // Struct intervals use one shared offset on both sides.
            offset_src = offset_dst = desc->interval[noncont_num].lower;
            send_size = desc->interval[noncont_num].size;
            noncont_num++;
        }
        else {
            // Advance the source side: fetch the next range when the
            // current one is exhausted, otherwise step within it.
            if (src_is_empty) {
                if (m_vars_extra[i].read_rng_src) {
                    if (!get_next_range(m_vars_extra[i].read_rng_src,
                                        &offset_src)) {
                        // source ranges are over - nothing to send
                        break;
                    }
                }
                else if (data_sent == 0) {
                    // Contiguous source: single range starting at cpu_disp.
                    offset_src = m_vars_extra[i].cpu_disp;
                }
                else {
                    break;
                }
                length_src_cur = length_src;
            }
            else {
                // if source is contiguous or its contiguous range is greater
                // than destination one
                offset_src += send_size;
            }
            length_src_cur -= send_size;
            src_is_empty = length_src_cur == 0;

            // Advance the destination side symmetrically.
            if (dst_is_empty) {
                if (m_vars[i].into) {
                    if (m_vars_extra[i].read_rng_dst) {
                        if (!get_next_range(m_vars_extra[i].read_rng_dst,
                                            &offset_dst)) {
                            // destination ranges are over
                            LIBOFFLOAD_ERROR(c_destination_is_over);
                            return false;
                        }
                    }
                    // into is contiguous.
                    else {
                        offset_dst = m_vars[i].disp;
                    }
                    length_dst_cur = length_dst;
                }
                // same as source
                else {
                    offset_dst = offset_src;
                    length_dst_cur = length_src;
                }
            }
            else {
                // if destination is contiguous or its contiguous range is greater
                // than source one
                offset_dst += send_size;
            }
            length_dst_cur -= send_size;
            dst_is_empty = length_dst_cur == 0;
        }
        // Each chunk gets its own out event from the grown m_in_deps.
        if (event) {
            event = &m_in_deps[m_in_deps_total++];
        }
        if (src_data != 0 && src_data->cpu_buf != 0) {
            // Source lives in a COI buffer - buffer-to-buffer copy.
            res = COI::BufferCopy(
                dst_data->mic_buf,
                src_data->cpu_buf,
                m_vars[i].mic_offset +
                m_vars[i].offset + offset_dst,
                m_vars_extra[i].cpu_offset + offset_src,
                send_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_copy, res);
            }
        }
        else {
            // Source is plain host memory - direct buffer write.
            char *base = offload_get_src_base(m_vars[i].ptr,
                                              m_vars_extra[i].type_src);
            res = COI::BufferWrite(
                dst_data->mic_buf,
                m_vars[i].mic_offset +
                m_vars[i].offset + offset_dst,
                base + offset_src,
                send_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_write, res);
            }
        }
        data_sent += send_size;
    }
    while (true);
    return true;
}
  3700. bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
  3701. {
  3702. OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
  3703. bool should_use_async_buffer_write = m_initial_need_runfunction;
  3704. uint64_t ptr_sent = 0;
  3705. COIRESULT res;
  3706. uint32_t in_deps_amount = 0;
  3707. COIEVENT *in_deps = NULL;
  3708. // For offload_transfer and offload with empty body without signal:
  3709. // - if there is only one buffer copy - send data synchronously
  3710. // - if there are multiple buffer copy and
  3711. // __offload_parallel_copy is false - send data synchronously
  3712. // - if there are multiple buffer copy and
  3713. // __offload_parallel_copy is true - send data asynchronously
  3714. // It concerns only big size data - greater than __offload_use_async_buffer_write.
  3715. // Data of size less than __offload_use_async_buffer_write are sent synchronously.
  3716. // Synchronous transfer results in better performance in COI.
  3717. // __offload_parallel_copy is false by default but can be changed
  3718. // via environment variable OFFLOAD_PARALLEL_COPY
  3719. if (!m_initial_need_runfunction && __offload_parallel_copy) {
  3720. int big_size_count = 0;
  3721. for (int i = 0; i < m_vars_total; i++) {
  3722. if (m_vars[i].direction.in &&
  3723. m_vars[i].size >= __offload_use_async_buffer_write) {
  3724. switch (m_vars_extra[i].type_dst) {
  3725. case c_data:
  3726. case c_void_ptr:
  3727. case c_void_ptr_ptr:
  3728. case c_cean_var:
  3729. if (m_vars[i].flags.is_static_dstn) {
  3730. big_size_count++;
  3731. }
  3732. break;
  3733. case c_string_ptr:
  3734. case c_string_ptr_ptr:
  3735. case c_data_ptr:
  3736. case c_data_ptr_ptr:
  3737. case c_cean_var_ptr:
  3738. case c_cean_var_ptr_ptr:
  3739. case c_dv_ptr:
  3740. case c_dv_data:
  3741. case c_dv_ptr_data:
  3742. case c_dv_data_slice:
  3743. case c_dv_ptr_data_slice:
  3744. big_size_count++;
  3745. break;
  3746. default:
  3747. break;
  3748. }
  3749. }
  3750. }
  3751. if (big_size_count > 1) {
  3752. should_use_async_buffer_write = true;
  3753. }
  3754. }
  3755. // Initiate send for pointer data
  3756. for (int i = 0; i < m_vars_total; i++) {
  3757. uint64_t sent_data = m_vars[i].size;
  3758. if (m_vars_extra[i].omp_last_event_type == c_last_write &&
  3759. m_in_deps_total > 0) {
  3760. m_num_in_dependencies = m_in_deps_total;
  3761. m_p_in_dependencies = m_in_deps;
  3762. }
  3763. switch (m_vars_extra[i].type_dst) {
  3764. case c_data_ptr_array:
  3765. break;
  3766. case c_data:
  3767. case c_void_ptr:
  3768. case c_void_ptr_ptr:
  3769. case c_cean_var:
  3770. if (m_vars[i].direction.in &&
  3771. m_vars[i].flags.is_static_dstn) {
  3772. COIEVENT *event =
  3773. (m_stream != no_stream ||
  3774. is_async ||
  3775. (should_use_async_buffer_write &&
  3776. m_vars[i].size >= __offload_use_async_buffer_write)) ?
  3777. &m_in_deps[m_in_deps_total++] : 0;
  3778. PtrData* dst_data = m_vars[i].into ?
  3779. m_vars_extra[i].dst_data :
  3780. m_vars_extra[i].src_data;
  3781. PtrData* src_data =
  3782. VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
  3783. VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
  3784. m_vars[i].flags.is_static ?
  3785. m_vars_extra[i].src_data : 0;
  3786. if (m_vars[i].flags.is_non_cont_struct ||
  3787. m_vars[i].flags.is_noncont_src ||
  3788. m_vars[i].flags.is_noncont_dst) {
  3789. if (!send_noncontiguous_pointer_data(
  3790. i, src_data, dst_data, event, sent_data,
  3791. m_num_in_dependencies, m_p_in_dependencies)) {
  3792. return false;
  3793. }
  3794. }
  3795. else if (src_data != 0 && src_data->cpu_buf != 0) {
  3796. res = COI::BufferCopy(
  3797. dst_data->mic_buf,
  3798. src_data->cpu_buf,
  3799. m_vars[i].mic_offset +
  3800. m_vars[i].offset + m_vars[i].disp,
  3801. m_vars_extra[i].cpu_offset +
  3802. m_vars_extra[i].cpu_disp,
  3803. m_vars[i].size,
  3804. COI_COPY_UNSPECIFIED,
  3805. m_num_in_dependencies,
  3806. m_p_in_dependencies,
  3807. event);
  3808. if (res != COI_SUCCESS) {
  3809. if (m_status != 0) {
  3810. m_status->result = translate_coi_error(res);
  3811. return false;
  3812. }
  3813. report_coi_error(c_buf_copy, res);
  3814. }
  3815. }
  3816. else {
  3817. char *base = offload_get_src_base(m_vars[i].ptr,
  3818. m_vars_extra[i].type_src);
  3819. res = COI::BufferWrite(
  3820. dst_data->mic_buf,
  3821. m_vars[i].mic_offset +
  3822. m_vars[i].offset + m_vars[i].disp,
  3823. base + m_vars_extra[i].cpu_disp,
  3824. m_vars[i].size,
  3825. COI_COPY_UNSPECIFIED,
  3826. m_num_in_dependencies,
  3827. m_p_in_dependencies,
  3828. event);
  3829. if (res != COI_SUCCESS) {
  3830. if (m_status != 0) {
  3831. m_status->result = translate_coi_error(res);
  3832. return false;
  3833. }
  3834. report_coi_error(c_buf_write, res);
  3835. }
  3836. }
  3837. ptr_sent += sent_data;
  3838. }
  3839. break;
  3840. case c_data_ptr:
  3841. // If use_device_ptr no data needs to be sent
  3842. if (m_vars[i].flags.use_device_ptr) {
  3843. break;
  3844. }
  3845. case c_string_ptr:
  3846. case c_string_ptr_ptr:
  3847. case c_data_ptr_ptr:
  3848. case c_cean_var_ptr:
  3849. case c_cean_var_ptr_ptr:
  3850. case c_dv_ptr:
  3851. if (m_vars[i].direction.in && m_vars[i].size > 0) {
  3852. COIEVENT *event =
  3853. (m_stream != no_stream ||
  3854. is_async ||
  3855. (should_use_async_buffer_write &&
  3856. m_vars[i].size >= __offload_use_async_buffer_write)) ?
  3857. &m_in_deps[m_in_deps_total++] : 0;
  3858. PtrData* dst_data = m_vars[i].into ?
  3859. m_vars_extra[i].dst_data :
  3860. m_vars_extra[i].src_data;
  3861. PtrData* src_data =
  3862. VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
  3863. VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
  3864. m_vars[i].flags.is_static ?
  3865. m_vars_extra[i].src_data : 0;
  3866. if (m_vars[i].flags.is_non_cont_struct ||
  3867. m_vars[i].flags.is_noncont_src ||
  3868. m_vars[i].flags.is_noncont_dst) {
  3869. send_noncontiguous_pointer_data(
  3870. i, src_data, dst_data, event, sent_data,
  3871. in_deps_amount, in_deps);
  3872. }
  3873. else if (src_data != 0 && src_data->cpu_buf != 0) {
  3874. res = COI::BufferCopy(
  3875. dst_data->mic_buf,
  3876. src_data->cpu_buf,
  3877. m_vars[i].mic_offset +
  3878. m_vars[i].offset + m_vars[i].disp,
  3879. m_vars_extra[i].cpu_offset +
  3880. m_vars_extra[i].cpu_disp,
  3881. m_vars[i].size,
  3882. COI_COPY_UNSPECIFIED,
  3883. m_num_in_dependencies,
  3884. m_p_in_dependencies,
  3885. event);
  3886. if (res != COI_SUCCESS) {
  3887. if (m_status != 0) {
  3888. m_status->result = translate_coi_error(res);
  3889. return false;
  3890. }
  3891. report_coi_error(c_buf_copy, res);
  3892. }
  3893. }
  3894. else {
  3895. char *base = offload_get_src_base(m_vars[i].ptr,
  3896. m_vars_extra[i].type_src);
  3897. res = COI::BufferWrite(
  3898. dst_data->mic_buf,
  3899. m_vars[i].mic_offset +
  3900. m_vars[i].offset + m_vars[i].disp,
  3901. base + m_vars_extra[i].cpu_disp,
  3902. m_vars[i].size,
  3903. COI_COPY_UNSPECIFIED,
  3904. m_num_in_dependencies,
  3905. m_p_in_dependencies,
  3906. event);
  3907. if (res != COI_SUCCESS) {
  3908. if (m_status != 0) {
  3909. m_status->result = translate_coi_error(res);
  3910. return false;
  3911. }
  3912. report_coi_error(c_buf_write, res);
  3913. }
  3914. }
  3915. ptr_sent += sent_data;
  3916. }
  3917. break;
  3918. case c_dv_data:
  3919. case c_dv_ptr_data:
  3920. if (m_vars[i].direction.in &&
  3921. m_vars[i].size > 0) {
  3922. PtrData *ptr_data = m_vars[i].into ?
  3923. m_vars_extra[i].dst_data :
  3924. m_vars_extra[i].src_data;
  3925. PtrData* src_data = m_vars_extra[i].src_data;
  3926. COIEVENT *event =
  3927. (m_stream != no_stream ||
  3928. is_async ||
  3929. (should_use_async_buffer_write &&
  3930. m_vars[i].size >= __offload_use_async_buffer_write)) ?
  3931. &m_in_deps[m_in_deps_total++] : 0;
  3932. if (m_vars[i].flags.is_non_cont_struct ||
  3933. m_vars[i].flags.is_noncont_src ||
  3934. m_vars[i].flags.is_noncont_dst) {
  3935. send_noncontiguous_pointer_data(
  3936. i, src_data, ptr_data, event, sent_data,
  3937. in_deps_amount, in_deps);
  3938. }
  3939. else if (src_data && src_data->cpu_buf != 0) {
  3940. res = COI::BufferCopy(
  3941. ptr_data->mic_buf,
  3942. src_data->cpu_buf,
  3943. m_vars[i].offset + ptr_data->mic_offset +
  3944. m_vars[i].disp,
  3945. m_vars_extra[i].cpu_offset +
  3946. m_vars_extra[i].cpu_disp,
  3947. m_vars[i].size,
  3948. COI_COPY_UNSPECIFIED,
  3949. m_num_in_dependencies,
  3950. m_p_in_dependencies,
  3951. event);
  3952. if (res != COI_SUCCESS) {
  3953. if (m_status != 0) {
  3954. m_status->result = translate_coi_error(res);
  3955. return false;
  3956. }
  3957. report_coi_error(c_buf_copy, res);
  3958. }
  3959. }
  3960. else {
  3961. char *base = offload_get_src_base(m_vars[i].ptr,
  3962. m_vars_extra[i].type_src);
  3963. res = COI::BufferWrite(
  3964. ptr_data->mic_buf,
  3965. ptr_data->mic_offset +
  3966. m_vars[i].offset + m_vars[i].disp,
  3967. base + m_vars_extra[i].cpu_disp,
  3968. m_vars[i].size,
  3969. COI_COPY_UNSPECIFIED,
  3970. m_num_in_dependencies,
  3971. m_p_in_dependencies,
  3972. event);
  3973. if (res != COI_SUCCESS) {
  3974. if (m_status != 0) {
  3975. m_status->result = translate_coi_error(res);
  3976. return false;
  3977. }
  3978. report_coi_error(c_buf_write, res);
  3979. }
  3980. }
  3981. ptr_sent += sent_data;
  3982. }
  3983. break;
  3984. case c_dv_data_slice:
  3985. case c_dv_ptr_data_slice:
  3986. if (m_vars[i].direction.in &&
  3987. m_vars[i].size > 0) {
  3988. PtrData *dst_data = m_vars[i].into ?
  3989. m_vars_extra[i].dst_data :
  3990. m_vars_extra[i].src_data;
  3991. PtrData* src_data =
  3992. (VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
  3993. VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src) ||
  3994. VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) ||
  3995. VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
  3996. m_vars[i].flags.is_static) ?
  3997. m_vars_extra[i].src_data : 0;
  3998. COIEVENT *event =
  3999. (m_stream != no_stream ||
  4000. is_async ||
  4001. (should_use_async_buffer_write &&
  4002. m_vars[i].size >= __offload_use_async_buffer_write)) ?
  4003. &m_in_deps[m_in_deps_total++] : 0;
  4004. if (m_vars[i].flags.is_non_cont_struct ||
  4005. m_vars[i].flags.is_noncont_src ||
  4006. m_vars[i].flags.is_noncont_dst) {
  4007. send_noncontiguous_pointer_data(
  4008. i, src_data, dst_data, event, sent_data,
  4009. in_deps_amount, in_deps);
  4010. }
  4011. else if (src_data && src_data->cpu_buf != 0) {
  4012. res = COI::BufferCopy(
  4013. dst_data->mic_buf,
  4014. src_data->cpu_buf,
  4015. m_vars[i].offset +
  4016. dst_data->mic_offset +
  4017. m_vars[i].disp,
  4018. m_vars_extra[i].cpu_offset +
  4019. m_vars_extra[i].cpu_disp,
  4020. m_vars[i].size,
  4021. COI_COPY_UNSPECIFIED,
  4022. m_num_in_dependencies,
  4023. m_p_in_dependencies,
  4024. event);
  4025. if (res != COI_SUCCESS) {
  4026. if (m_status != 0) {
  4027. m_status->result = translate_coi_error(res);
  4028. return false;
  4029. }
  4030. report_coi_error(c_buf_copy, res);
  4031. }
  4032. }
  4033. else {
  4034. char *base = offload_get_src_base(m_vars[i].ptr,
  4035. m_vars_extra[i].type_src);
  4036. res = COI::BufferWrite(
  4037. dst_data->mic_buf,
  4038. dst_data->mic_offset +
  4039. m_vars[i].offset + m_vars[i].disp,
  4040. base + m_vars_extra[i].cpu_disp,
  4041. m_vars[i].size,
  4042. COI_COPY_UNSPECIFIED,
  4043. m_num_in_dependencies,
  4044. m_p_in_dependencies,
  4045. event);
  4046. if (res != COI_SUCCESS) {
  4047. if (m_status != 0) {
  4048. m_status->result = translate_coi_error(res);
  4049. return false;
  4050. }
  4051. report_coi_error(c_buf_write, res);
  4052. }
  4053. }
  4054. ptr_sent += sent_data;
  4055. }
  4056. break;
  4057. default:
  4058. break;
  4059. }
  4060. if (m_vars_extra[i].omp_last_event_type == c_last_write) {
  4061. register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
  4062. }
  4063. // alloc field isn't used at target.
  4064. // We can reuse it for offset of array pointers.
  4065. if (m_vars_extra[i].is_arr_ptr_el) {
  4066. m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
  4067. }
  4068. }
  4069. // list of out events created while send_pointer_data now became input
  4070. // dependencies for runfunction (or Read transfers from target if
  4071. // runfunction is absent)
  4072. m_num_in_dependencies = m_in_deps_total ? m_in_deps_total :
  4073. m_num_in_dependencies;
  4074. m_p_in_dependencies = m_in_deps_total ? m_in_deps : m_p_in_dependencies;
  4075. if (m_status) {
  4076. m_status->data_sent += ptr_sent;
  4077. }
  4078. OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
  4079. OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
  4080. c_offload_sent_pointer_data,
  4081. "Total pointer data sent to target: [%lld] bytes\n",
  4082. ptr_sent);
  4083. return true;
  4084. }
// Gather all copyin ("in") payloads into the single input data area that is
// shipped to the target together with the run-function descriptor.
//
// Layout written here: first the raw VarDesc array (m_vars), then, via the
// m_in marshaller, per-variable extras selected by the switch below
// (alloc_disp, pointer offsets, sink addresses, scalar/cean payloads, dope
// vectors, obsolete stack addresses, function pointers).
//
// The destination is either the dedicated inout COI buffer (m_inout_buf,
// mapped/unmapped around the marshalling) or, when absent, the in-place data
// area at m_func_desc + data_offset.
//
// Returns true on success. On a COI failure: returns false after storing the
// translated error into *m_status when a status object exists, otherwise
// reports a fatal error via report_coi_error (which does not return).
bool OffloadDescriptor::gather_copyin_data()
{
    OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);

    if (m_need_runfunction && m_in_datalen > 0) {
        COIMAPINSTANCE map_inst;
        char *data;

        // init marshaller
        if (m_inout_buf != 0) {
            OffloadTimer timer_map(get_timer_data(),
                                   c_offload_host_map_in_data_buffer);

            // Map the whole input area for writing; "data" receives the
            // host-visible address of the mapped region.
            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
                                           COI_MAP_WRITE_ENTIRE_BUFFER,
                                           0, 0, 0, &map_inst,
                                           reinterpret_cast<void**>(&data));
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_map, res);
            }
        }
        else {
            // No separate inout buffer: marshal in place into the data area
            // that trails the function descriptor.
            data = (char*) m_func_desc + m_func_desc->data_offset;
        }

        // send variable descriptors
        memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
        data += m_vars_total * sizeof(VarDesc);

        // init marshaller
        m_in.init_buffer(data, m_in_datalen);

        // Gather copy data into buffer
        for (int i = 0; i < m_vars_total; i++) {
            // The target-side counterpart of an OUT (or into-less) variable
            // is described by src_data; otherwise by dst_data.
            bool src_is_for_mic = (m_vars[i].direction.out ||
                                   m_vars[i].into == NULL);

            PtrData* ptr_data = src_is_for_mic ?
                                m_vars_extra[i].src_data :
                                m_vars_extra[i].dst_data;
            if (m_vars[i].flags.alloc_disp) {
                m_in.send_data(&ptr_data->alloc_disp,
                               sizeof(ptr_data->alloc_disp));
            }
            // Pointer-to-pointer variables (and pointer elements of pointer
            // arrays) additionally need their offset inside the enclosing
            // object on the target side.
            if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
                TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
                (m_vars_extra[i].type_src == c_data_ptr_array &&
                 m_vars[i].flags.is_pointer)) {
                m_in.send_data(&m_vars_extra[i].pointer_offset,
                               sizeof(m_vars_extra[i].pointer_offset));
            }
            // send sink address to the target
            if (m_vars[i].flags.sink_addr) {
                m_in.send_data(&ptr_data->mic_addr,
                               sizeof(ptr_data->mic_addr));
            }

            switch (m_vars_extra[i].type_dst) {
                case c_data_ptr_array:
                    break;
                case c_data:
                case c_void_ptr:
                case c_void_ptr_ptr:
                case c_cean_var:
                    // Non-static scalars/cean vars travel inside the copyin
                    // buffer; static ones go through COI buffers elsewhere.
                    if (m_vars[i].direction.in &&
                        !m_vars[i].flags.is_static_dstn) {
                        char *ptr = offload_get_src_base(m_vars[i].ptr,
                                        m_vars_extra[i].type_src);
                        if (m_vars_extra[i].type_dst == c_cean_var) {
                            // offset and length are derived from the array
                            // descriptor
                            int64_t size = m_vars[i].size;
                            int64_t disp = m_vars[i].disp;
                            m_in.send_data(reinterpret_cast<char*>(&size),
                                           sizeof(int64_t));
                            m_in.send_data(reinterpret_cast<char*>(&disp),
                                           sizeof(int64_t));
                        }
                        m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
                                       m_vars[i].size);
                    }
                    break;
                case c_dv:
                    if (m_vars[i].direction.bits ||
                        m_vars[i].alloc_if ||
                        m_vars[i].free_if) {
                        // send dope vector excluding base
                        // (the leading uint64_t is the base address, which is
                        // meaningless on the target)
                        char *ptr = static_cast<char*>(m_vars[i].ptr);
                        m_in.send_data(ptr + sizeof(uint64_t),
                                       m_vars[i].size - sizeof(uint64_t));
                    }
                    break;
                case c_data_ptr:
                    // send to target addresses of obsolete
                    // stacks to be released
                    if (m_vars[i].flags.is_stack_buf &&
                        !m_vars[i].direction.bits &&
                        m_vars[i].alloc_if &&
                        m_vars[i].size != 0) {
                        for (PtrDataList::iterator it =
                            m_destroy_stack.begin();
                            it != m_destroy_stack.end(); it++) {
                            PtrData * ptr_data = *it;
                            m_in.send_data(&(ptr_data->mic_addr),
                                           sizeof(ptr_data->mic_addr));
                        }
                    }
                    break;
                case c_func_ptr:
                case c_func_ptr_ptr:
                    if (m_vars[i].direction.in) {
                        // Function pointers are translated to target-side
                        // equivalents by the marshaller.
                        m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
                    }
                    break;
                default:
                    break;
            }
        }

        if (m_status) {
            m_status->data_sent += m_in.get_tfr_size();
        }

        // NOTE(review): unmap is guarded by data_offset == 0 while the map
        // above is guarded by m_inout_buf != 0 — these are presumed to be
        // equivalent conditions (the inout buffer exists exactly when data
        // does not fit in the descriptor); confirm, else map_inst may be
        // used uninitialized here.
        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_unmap(get_timer_data(),
                                     c_offload_host_unmap_in_data_buffer);
            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_unmap, res);
            }
        }
    }

    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
    OFFLOAD_DEBUG_TRACE_1(1,
                  GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
                  "Total copyin data sent to target: [%lld] bytes\n",
                  m_in.get_tfr_size());
    return true;
}
  4222. bool OffloadDescriptor::compute(void *info)
  4223. {
  4224. OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
  4225. if (m_need_runfunction) {
  4226. OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
  4227. c_offload_compute, "Compute task on MIC\n");
  4228. void* misc = m_func_desc;
  4229. int misc_len = m_func_desc_size;
  4230. void* ret = 0;
  4231. int ret_len = 0;
  4232. if (m_func_desc->data_offset != 0) {
  4233. misc_len += m_in_datalen;
  4234. if (m_out_datalen > 0) {
  4235. ret = (char*) m_func_desc + m_func_desc->data_offset;
  4236. ret_len = m_out_datalen;
  4237. }
  4238. }
  4239. // dispatch task
  4240. COIRESULT res;
  4241. COIEVENT event;
  4242. res = m_device.compute(m_stream,
  4243. m_compute_buffers,
  4244. misc, misc_len,
  4245. ret, ret_len,
  4246. m_num_in_dependencies,
  4247. m_p_in_dependencies,
  4248. &event);
  4249. if (res != COI_SUCCESS) {
  4250. if (m_status != 0) {
  4251. m_status->result = translate_coi_error(res);
  4252. return false;
  4253. }
  4254. report_coi_error(c_pipeline_run_func, res);
  4255. }
  4256. if (m_omp_async_last_event_type == c_last_runfunc) {
  4257. register_omp_event_call_back(&event, info);
  4258. }
  4259. m_in_deps_total = m_num_in_dependencies = 1;
  4260. m_in_deps[0] = event;
  4261. m_p_in_dependencies = m_in_deps;
  4262. }
  4263. return true;
  4264. }
  4265. // receive pointer data if source or destination or both of them are
  4266. // noncontiguous. There is guarantee that length of destination enough for
  4267. // transferred data.
// Receive pointer data for variable i when the source, the destination, or
// both are noncontiguous. There is a guarantee that the destination is large
// enough for the transferred data.
//
// Parameters:
//   i             - index into m_vars / m_vars_extra of the variable.
//   dst_buf       - host-side COI buffer to copy into; when NULL the data is
//                   read directly into host memory (BufferRead path).
//   event         - when non-NULL, out-events are recorded per contiguous
//                   chunk into m_out_deps (the array is grown accordingly).
//   received_data - out: total number of bytes received.
//   in_deps_amount/in_deps - NOTE(review): not referenced in this body; the
//                   COI calls below use m_num_in_dependencies /
//                   m_p_in_dependencies instead. Presumably retained for
//                   signature compatibility — confirm against callers.
//
// Returns true on success; false when a COI call fails and m_status is set,
// or when destination ranges are exhausted before the source.
bool OffloadDescriptor::receive_noncontiguous_pointer_data(
    int i,
    COIBUFFER dst_buf,
    COIEVENT *event,
    uint64_t &received_data,
    uint32_t in_deps_amount,
    COIEVENT *in_deps
)
{
    NonContigDesc *desc;
    int noncont_num;
    int64_t offset_src, offset_dst;
    int64_t length_src, length_dst;
    int64_t length_src_cur, length_dst_cur;
    int64_t receive_size;
    COIRESULT res;
    bool dst_is_empty = true;
    bool src_is_empty = true;
    // Host base address of the receive destination ("into" when present,
    // otherwise the variable itself).
    char *base = offload_get_src_base(
        m_vars[i].into ?
        static_cast<char*>(m_vars[i].into) :
        static_cast<char*>(m_vars[i].ptr),
        m_vars_extra[i].type_dst);
    received_data = 0;

    // If BufferReadMultiD is defined we can set values of required arguments
    // and transfer noncontiguous data via call to the COI routine.
    if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
        struct Arr_Desc* arr_desc_dst;
        struct Arr_Desc* arr_desc_src;
        int64_t size_src, size_dst;

        // Starting offset and total size on each side come from the read
        // range when one exists, otherwise from the variable's disp/size.
        offset_src = (m_vars_extra[i].read_rng_src)?
            m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
        size_src = m_vars_extra[i].read_rng_src ?
            cean_get_transf_size(m_vars_extra[i].read_rng_src) :
            m_vars[i].size;

        offset_dst = (m_vars_extra[i].read_rng_dst)?
            m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp;
        size_dst = m_vars_extra[i].read_rng_dst ?
            cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;

        // Element size for the synthesized array descriptor: taken from the
        // innermost dimension of whichever side has a read range; 1 when
        // both (or neither) side has one.
        int64_t el_size = (!m_vars[i].into ||
                           (m_vars_extra[i].read_rng_src &&
                            m_vars_extra[i].read_rng_dst)) ?
                           1 :
                           m_vars_extra[i].read_rng_src ?
                               m_vars_extra[i].read_rng_src->arr_desc->dim[
                                   m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
                               m_vars_extra[i].read_rng_dst->arr_desc->dim[
                                   m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
        arr_desc_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->arr_desc :
            make_arr_desc(NULL, // don't required for source
                offset_src/el_size, size_src/el_size,
                el_size);
        arr_desc_dst = !m_vars[i].into ? arr_desc_src :
            (m_vars_extra[i].read_rng_dst) ?
            m_vars_extra[i].read_rng_dst->arr_desc :
            make_arr_desc(NULL,
                offset_dst/el_size, size_src/el_size, el_size);

        arr_desc_dst->base = reinterpret_cast<int64_t>(base);

        res = COI::BufferReadMultiD(
            m_vars_extra[i].src_data->mic_buf, // SourceBuffer
            m_vars[i].offset + m_vars[i].mic_offset -
            m_vars_extra[i].src_data->alloc_disp, // Offset
            (void*)arr_desc_dst, // descriptor of DestArray
            (void*)arr_desc_src, // descriptor of SrcArray
            COI_COPY_UNSPECIFIED, // Type
            m_num_in_dependencies, // Number of in Dependencies
            m_p_in_dependencies, // array of in Dependencies
            event); // out Dependency
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_copy, res);
        }
        return(true);
    }
    // Fallback path: walk contiguous intervals manually.
    if (m_vars[i].flags.is_non_cont_struct) {
        // Intervals come from a precomputed descriptor.
        desc = m_vars_extra[i].noncont_desc;
        noncont_num = 0;
    }
    else {
        // Set length_src and length_dst
        length_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
        length_dst = !m_vars[i].into ? length_src :
            (m_vars_extra[i].read_rng_dst) ?
            m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
        // Each chunk is the smaller of the two contiguous run lengths.
        receive_size = (length_src < length_dst) ? length_src : length_dst;
    }

    // if event is defined we must multiplate for all contiguous intervals
    // that will be Copied/Read.
    // Take in account that we already have 1 event.
    if (event) {
        uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
            desc->interval_cnt :
            (length_src / receive_size) *
            ((m_vars_extra[i].read_rng_src) ?
             m_vars_extra[i].read_rng_src->range_max_number : 1) ;
        m_out_deps_allocated += range_num;
        m_out_deps =
            (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated);
        // Step back one slot: the caller already reserved one event for this
        // variable; the loop below re-claims it as the first chunk's event.
        m_out_deps_total--;
    }
    // consequently get contiguous ranges,
    // define corresponded destination offset and receive data
    do {
        if (m_vars[i].flags.is_non_cont_struct) {
            // ranges are over
            if (noncont_num >= desc->interval_cnt) {
                break;
            }
            // Struct intervals use identical src/dst offsets.
            offset_src = offset_dst = desc->interval[noncont_num].lower;
            receive_size = desc->interval[noncont_num].size;
            noncont_num++;
        }
        else { // get source offset
            if (src_is_empty) {
                if (m_vars_extra[i].read_rng_src) {
                    if (!get_next_range(m_vars_extra[i].read_rng_src,
                                        &offset_src)) {
                        // source ranges are over - nothing to send
                        break;
                    }
                }
                else if (received_data == 0) {
                    // Contiguous source: single run starting at disp.
                    offset_src = m_vars[i].disp;
                }
                else {
                    break;
                }
                length_src_cur = length_src;
            }
            else {
                // if source is contiguous or its contiguous range is greater
                // than destination one
                offset_src += receive_size;
            }
            length_src_cur -= receive_size;
            src_is_empty = length_src_cur == 0;

            // get destination offset
            if (dst_is_empty) {
                if (m_vars[i].into) {
                    if (m_vars_extra[i].read_rng_dst) {
                        if (!get_next_range(m_vars_extra[i].read_rng_dst,
                                            &offset_dst)) {
                            // destination ranges are over
                            LIBOFFLOAD_ERROR(c_destination_is_over);
                            return false;
                        }
                    }
                    // destination is contiguous.
                    else {
                        offset_dst = m_vars_extra[i].cpu_disp;
                    }
                    length_dst_cur = length_dst;
                }
                // same as source
                else {
                    offset_dst = offset_src;
                    length_dst_cur = length_src;
                }
            }
            else {
                // if destination is contiguous or its contiguous range is greater
                // than source one
                offset_dst += receive_size;
            }
            length_dst_cur -= receive_size;
            dst_is_empty = length_dst_cur == 0;
        }
        if (event) {
            // Record this chunk's completion into the out-dependency list.
            event = &m_out_deps[m_out_deps_total++];
        }
        if (dst_buf != 0) {
            // Device-to-host-buffer copy for this contiguous chunk.
            res = COI::BufferCopy(
                dst_buf,
                m_vars_extra[i].src_data->mic_buf,
                m_vars_extra[i].cpu_offset + offset_dst,
                m_vars[i].offset + offset_src +
                m_vars[i].mic_offset,
                receive_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_copy, res);
            }
        }
        else {
            // No host buffer: read straight into host memory at base.
            res = COI::BufferRead(
                m_vars_extra[i].src_data->mic_buf,
                m_vars[i].offset + offset_src +
                m_vars[i].mic_offset,
                base + offset_dst,
                receive_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_read, res);
            }
        }
        received_data += receive_size;
    }
    while (true);
    return true;
}
  4487. bool OffloadDescriptor::receive_pointer_data(bool is_async,
  4488. bool first_run, void *info)
  4489. {
  4490. OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
  4491. bool should_use_async_buffer_read = m_initial_need_runfunction;
  4492. uint64_t ptr_received = 0;
  4493. COIRESULT res;
  4494. // For offload_transfer and offload with empty body without signal:
  4495. // - if there is only one buffer copy - get data synchronously
  4496. // - if there are multiple buffer copy and
  4497. // __offload_parallel_copy is false - get data synchronously
  4498. // - if there are multiple buffer copy
  4499. // and __offload_parallel_copy is true - get data asynchronously
  4500. // It concerns only data with size greater than __offload_use_async_buffer_read.
  4501. // Data of size less than __offload_use_async_buffer_read are received synchronously.
  4502. // Synchronous transfer results in better performance in COI.
  4503. // __offload_parallel_copy is false by default but can be changed
  4504. // via environment variable OFFLOAD_PARALLEL_COPY
  4505. if (!m_initial_need_runfunction && __offload_parallel_copy) {
  4506. int big_size_count = 0;
  4507. for (int i = 0; i < m_vars_total; i++) {
  4508. if (m_vars[i].direction.out &&
  4509. m_vars[i].size >= __offload_use_async_buffer_read) {
  4510. // preallocated OUT only at second run
  4511. if (first_run == m_vars[i].flags.preallocated) {
  4512. continue;
  4513. }
  4514. switch (m_vars_extra[i].type_src) {
  4515. case c_data:
  4516. case c_void_ptr:
  4517. case c_void_ptr_ptr:
  4518. case c_cean_var:
  4519. if (m_vars[i].flags.is_static) {
  4520. big_size_count++;
  4521. }
  4522. break;
  4523. case c_string_ptr:
  4524. case c_data_ptr:
  4525. case c_string_ptr_ptr:
  4526. case c_data_ptr_ptr:
  4527. case c_cean_var_ptr:
  4528. case c_cean_var_ptr_ptr:
  4529. case c_dv_data:
  4530. case c_dv_ptr_data:
  4531. case c_dv_data_slice:
  4532. case c_dv_ptr_data_slice:
  4533. case c_dv_ptr:
  4534. big_size_count++;
  4535. break;
  4536. default:
  4537. break;
  4538. }
  4539. }
  4540. }
  4541. if (big_size_count > 1) {
  4542. should_use_async_buffer_read = true;
  4543. }
  4544. }
  4545. uint32_t in_deps_amount = m_in_deps_total;
  4546. COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
  4547. for (int i = 0; i < m_vars_total; i++) {
  4548. uint64_t received_data = m_vars[i].size;
  4549. // Nothing to receive if use_device_ptr
  4550. if (m_vars[i].flags.use_device_ptr )
  4551. continue;
  4552. if (m_vars_extra[i].omp_last_event_type == c_last_read &&
  4553. m_out_deps_total > 0) {
  4554. m_num_in_dependencies = m_out_deps_total;
  4555. m_p_in_dependencies = m_out_deps;
  4556. }
  4557. // At first run don't receive by preallocated target pointer as the
  4558. //pointer value will be ready later after call to scatter_copyout_data
  4559. if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
  4560. m_preallocated_alloc = true;
  4561. // need one more call to OffloadDescriptor::receive_pointer_data
  4562. if (m_vars[i].direction.out) {
  4563. m_out_with_preallocated = true;
  4564. }
  4565. continue;
  4566. }
  4567. switch (m_vars_extra[i].type_src) {
  4568. case c_data_ptr_array:
  4569. break;
  4570. case c_data:
  4571. case c_void_ptr:
  4572. case c_void_ptr_ptr:
  4573. case c_cean_var:
  4574. if (m_vars[i].direction.out &&
  4575. m_vars[i].flags.is_static) {
  4576. COIEVENT *event =
  4577. (m_stream != no_stream ||
  4578. is_async ||
  4579. m_in_deps_total > 0 ||
  4580. (should_use_async_buffer_read &&
  4581. m_vars[i].size >= __offload_use_async_buffer_read)) ?
  4582. &m_out_deps[m_out_deps_total++] : 0;
  4583. PtrData *ptr_data = NULL;
  4584. COIBUFFER dst_buf = NULL; // buffer at host
  4585. char *base;
  4586. if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
  4587. ptr_data = m_vars[i].into ?
  4588. m_vars_extra[i].dst_data :
  4589. m_vars_extra[i].src_data;
  4590. }
  4591. else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
  4592. if (m_vars[i].flags.is_static_dstn) {
  4593. ptr_data = m_vars[i].into ?
  4594. m_vars_extra[i].dst_data :
  4595. m_vars_extra[i].src_data;
  4596. }
  4597. }
  4598. dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
  4599. if (dst_buf == NULL) {
  4600. base = offload_get_src_base(
  4601. m_vars[i].into ?
  4602. static_cast<char*>(m_vars[i].into) :
  4603. static_cast<char*>(m_vars[i].ptr),
  4604. m_vars_extra[i].type_dst);
  4605. }
  4606. if (m_vars[i].flags.is_non_cont_struct ||
  4607. m_vars[i].flags.is_noncont_src ||
  4608. m_vars[i].flags.is_noncont_dst) {
  4609. receive_noncontiguous_pointer_data(
  4610. i, dst_buf, event, received_data,
  4611. m_num_in_dependencies, m_p_in_dependencies);
  4612. }
  4613. else if (dst_buf != 0) {
  4614. res = COI::BufferCopy(
  4615. dst_buf,
  4616. m_vars_extra[i].src_data->mic_buf,
  4617. m_vars_extra[i].cpu_offset +
  4618. m_vars_extra[i].cpu_disp,
  4619. m_vars[i].offset + m_vars[i].disp,
  4620. m_vars[i].size,
  4621. COI_COPY_UNSPECIFIED,
  4622. m_num_in_dependencies,
  4623. m_p_in_dependencies,
  4624. event);
  4625. if (res != COI_SUCCESS) {
  4626. if (m_status != 0) {
  4627. m_status->result = translate_coi_error(res);
  4628. return false;
  4629. }
  4630. report_coi_error(c_buf_copy, res);
  4631. }
  4632. }
  4633. else {
  4634. res = COI::BufferRead(
  4635. m_vars_extra[i].src_data->mic_buf,
  4636. m_vars[i].offset + m_vars[i].disp,
  4637. base + m_vars_extra[i].cpu_offset +
  4638. m_vars_extra[i].cpu_disp,
  4639. m_vars[i].size,
  4640. COI_COPY_UNSPECIFIED,
  4641. m_num_in_dependencies,
  4642. m_p_in_dependencies,
  4643. event);
  4644. if (res != COI_SUCCESS) {
  4645. if (m_status != 0) {
  4646. m_status->result = translate_coi_error(res);
  4647. return false;
  4648. }
  4649. report_coi_error(c_buf_read, res);
  4650. }
  4651. }
  4652. ptr_received += received_data;
  4653. }
  4654. break;
  4655. case c_string_ptr:
  4656. case c_data_ptr:
  4657. case c_string_ptr_ptr:
  4658. case c_data_ptr_ptr:
  4659. case c_cean_var_ptr:
  4660. case c_cean_var_ptr_ptr:
  4661. case c_dv_data:
  4662. case c_dv_ptr_data:
  4663. case c_dv_data_slice:
  4664. case c_dv_ptr_data_slice:
  4665. case c_dv_ptr: {
  4666. COIBUFFER dst_buf = NULL; // buffer on host
  4667. if (m_vars[i].direction.out && m_vars[i].size > 0) {
  4668. COIEVENT *event =
  4669. (m_stream != no_stream ||
  4670. is_async ||
  4671. m_in_deps_total > 0 ||
  4672. (should_use_async_buffer_read &&
  4673. m_vars[i].size >= __offload_use_async_buffer_read)) ?
  4674. &m_out_deps[m_out_deps_total++] : 0;
  4675. uint64_t dst_offset = 0;
  4676. char *base = static_cast<char*>(m_vars[i].ptr);
  4677. if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
  4678. PtrData *ptr_data = m_vars[i].into ?
  4679. m_vars_extra[i].dst_data :
  4680. m_vars_extra[i].src_data;
  4681. dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
  4682. if (dst_buf == NULL) {
  4683. base = m_vars[i].into ?
  4684. *static_cast<char**>(m_vars[i].into) :
  4685. *static_cast<char**>(m_vars[i].ptr);
  4686. }
  4687. dst_offset = m_vars_extra[i].cpu_offset +
  4688. m_vars_extra[i].cpu_disp;
  4689. }
  4690. else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
  4691. if (m_vars[i].flags.is_static_dstn) {
  4692. dst_buf = m_vars[i].into ?
  4693. m_vars_extra[i].dst_data->cpu_buf :
  4694. m_vars_extra[i].src_data->cpu_buf;
  4695. }
  4696. if (dst_buf == NULL) {
  4697. base = offload_get_src_base(
  4698. m_vars[i].into ?
  4699. static_cast<char*>(m_vars[i].into) :
  4700. static_cast<char*>(m_vars[i].ptr),
  4701. m_vars_extra[i].type_dst);
  4702. }
  4703. dst_offset = m_vars_extra[i].cpu_offset +
  4704. m_vars_extra[i].cpu_disp;
  4705. }
  4706. else if (VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst) ||
  4707. VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
  4708. PtrData *ptr_data = m_vars[i].into != 0 ?
  4709. m_vars_extra[i].dst_data :
  4710. m_vars_extra[i].src_data;
  4711. dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
  4712. if (dst_buf == NULL) {
  4713. base = offload_get_src_base(
  4714. m_vars[i].into ?
  4715. static_cast<char*>(m_vars[i].into) :
  4716. static_cast<char*>(m_vars[i].ptr),
  4717. m_vars_extra[i].type_dst);
  4718. }
  4719. dst_offset = m_vars_extra[i].cpu_offset +
  4720. m_vars_extra[i].cpu_disp;
  4721. }
  4722. if (m_vars[i].flags.is_non_cont_struct ||
  4723. m_vars[i].flags.is_noncont_src ||
  4724. m_vars[i].flags.is_noncont_dst) {
  4725. receive_noncontiguous_pointer_data(
  4726. i, dst_buf, event, received_data,
  4727. m_num_in_dependencies, m_p_in_dependencies);
  4728. }
  4729. else if (dst_buf != 0) {
  4730. res = COI::BufferCopy(
  4731. dst_buf,
  4732. m_vars_extra[i].src_data->mic_buf,
  4733. dst_offset,
  4734. m_vars[i].offset + m_vars[i].disp +
  4735. m_vars[i].mic_offset,
  4736. m_vars[i].size,
  4737. COI_COPY_UNSPECIFIED,
  4738. m_num_in_dependencies,
  4739. m_p_in_dependencies,
  4740. event);
  4741. if (res != COI_SUCCESS) {
  4742. if (m_status != 0) {
  4743. m_status->result = translate_coi_error(res);
  4744. return false;
  4745. }
  4746. report_coi_error(c_buf_copy, res);
  4747. }
  4748. }
  4749. else {
  4750. res = COI::BufferRead(
  4751. m_vars_extra[i].src_data->mic_buf,
  4752. m_vars[i].offset + m_vars[i].disp +
  4753. m_vars[i].mic_offset,
  4754. base + dst_offset,
  4755. m_vars[i].size,
  4756. COI_COPY_UNSPECIFIED,
  4757. m_num_in_dependencies,
  4758. m_p_in_dependencies,
  4759. event);
  4760. if (res != COI_SUCCESS) {
  4761. if (m_status != 0) {
  4762. m_status->result = translate_coi_error(res);
  4763. return false;
  4764. }
  4765. report_coi_error(c_buf_read, res);
  4766. }
  4767. }
  4768. ptr_received += received_data;
  4769. }
  4770. break;
  4771. }
  4772. default:
  4773. break;
  4774. }
  4775. if (m_vars_extra[i].omp_last_event_type == c_last_read) {
  4776. register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info);
  4777. }
  4778. // destroy buffers for obsolete stacks
  4779. if (m_destroy_stack.size() != 0) {
  4780. for (PtrDataList::iterator it = m_destroy_stack.begin();
  4781. it != m_destroy_stack.end(); it++) {
  4782. PtrData *ptr_data = *it;
  4783. m_destroy_buffers.push_back(ptr_data->mic_buf);
  4784. OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
  4785. ptr_data->mic_addr);
  4786. }
  4787. m_destroy_stack.clear();
  4788. }
  4789. if (m_vars[i].free_if) {
  4790. // remove association for automatic variables
  4791. if (m_is_openmp) {
  4792. if (m_vars_extra[i].auto_data) {
  4793. AutoData *auto_data = m_vars_extra[i].auto_data;
  4794. if (m_vars[i].flags.always_delete) {
  4795. auto_data->nullify_reference();
  4796. }
  4797. else if (auto_data->remove_reference() == 0) {
  4798. m_device.remove_auto_data(auto_data->cpu_addr.start());
  4799. }
  4800. continue;
  4801. }
  4802. else {
  4803. PtrData *ptr_data = m_vars_extra[i].src_data;
  4804. if (ptr_data &&
  4805. IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
  4806. if (ptr_data->get_reference() > 0) {
  4807. ptr_data->remove_reference();
  4808. }
  4809. continue;
  4810. }
  4811. }
  4812. }
  4813. // destroy buffers
  4814. if (m_vars[i].direction.out || m_vars[i].into == NULL) {
  4815. if (!VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) &&
  4816. !VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) &&
  4817. !VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src)) {
  4818. continue;
  4819. }
  4820. PtrData *ptr_data = m_vars_extra[i].src_data;
  4821. if (ptr_data->remove_reference() == 0) {
  4822. // destroy buffers
  4823. if (ptr_data->cpu_buf != 0) {
  4824. m_destroy_buffers.push_back(ptr_data->cpu_buf);
  4825. }
  4826. if (ptr_data->mic_buf != 0) {
  4827. m_destroy_buffers.push_back(ptr_data->mic_buf);
  4828. }
  4829. OFFLOAD_TRACE(3, "Removing association for addr %p\n",
  4830. ptr_data->cpu_addr.start());
  4831. // remove association from map
  4832. if (m_vars[i].flags.targetptr) {
  4833. m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
  4834. }
  4835. else {
  4836. m_device.remove_ptr_data(ptr_data->cpu_addr.start());
  4837. }
  4838. }
  4839. }
  4840. else if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst) ||
  4841. VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst) ||
  4842. VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst)) {
  4843. PtrData *ptr_data = m_vars_extra[i].dst_data;
  4844. if (ptr_data->remove_reference() == 0) {
  4845. // destroy buffers
  4846. if (ptr_data->cpu_buf != 0) {
  4847. m_destroy_buffers.push_back(ptr_data->cpu_buf);
  4848. }
  4849. if (ptr_data->mic_buf != 0) {
  4850. m_destroy_buffers.push_back(ptr_data->mic_buf);
  4851. }
  4852. OFFLOAD_TRACE(3, "Removing association for addr %p\n",
  4853. ptr_data->cpu_addr.start());
  4854. // remove association from map
  4855. if (m_vars[i].flags.targetptr) {
  4856. m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
  4857. }
  4858. else {
  4859. m_device.remove_ptr_data(ptr_data->cpu_addr.start());
  4860. }
  4861. }
  4862. }
  4863. }
  4864. }
  4865. if (m_status) {
  4866. m_status->data_received += ptr_received;
  4867. }
  4868. m_num_in_dependencies = m_out_deps_total ? m_out_deps_total :
  4869. m_num_in_dependencies;
  4870. m_p_in_dependencies = m_out_deps_total ? m_out_deps : m_p_in_dependencies;
  4871. OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
  4872. OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
  4873. c_offload_received_pointer_data,
  4874. "Total pointer data received from target: [%lld] bytes\n",
  4875. ptr_received);
  4876. return true;
  4877. }
// Unpacks the output block produced by the target side of an offload and
// scatters its contents back into host variables: plain data, CEAN vars,
// preallocated target pointers and function pointers marshalled through
// the in/out buffer (m_out).
//
// Returns false (with m_status->result set) on a COI failure when a
// status object was supplied by the caller; otherwise report_coi_error()
// is fatal. Returns true on success.
bool OffloadDescriptor::scatter_copyout_data()
{
    OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);

    // Nothing to do unless the target function ran and produced outputs.
    if (m_need_runfunction && m_out_datalen > 0) {

        // total size that need to be transferred from target to host
        COIMAPINSTANCE map_inst;
        COIRESULT res;   // NOTE(review): shadowed by the inner `res`
                         // declarations below; this outer one is unused.
        char *data;

        // output data buffer
        // data_offset == 0 means the output did not fit into the misc
        // data area of the function descriptor, so it lives in the
        // separate in/out COI buffer which must be mapped first.
        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_map(get_timer_data(),
                                   c_offload_host_map_out_data_buffer);

            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
                                           COI_MAP_READ_ONLY, 0, 0, 0,
                                           &map_inst,
                                           reinterpret_cast<void**>(&data));
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_map, res);
            }
        }
        else {
            // Output was piggybacked on the function descriptor itself.
            data = (char*) m_func_desc + m_func_desc->data_offset;
        }

        // get timing data
        // Target-side timer records precede the variable data.
        OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
        data += OFFLOAD_TIMER_DATALEN();

        // initialize output marshaller
        m_out.init_buffer(data, m_out_datalen);

        for (int i = 0; i < m_vars_total; i++) {
            bool src_is_for_mic = (m_vars[i].direction.out ||
                                   m_vars[i].into == NULL);

            // Preallocated target pointers: the target sends back the
            // address it allocated; register it in the pointer table and
            // publish it to the host-side pointer variable.
            if (m_vars_extra[i].type_src != c_data_ptr_array &&
                m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
                PtrData *ptr_data;
                void *ptr_value;
                void ** cpu_ptr = src_is_for_mic ?
                                  reinterpret_cast<void**>(m_vars[i].ptr) :
                                  reinterpret_cast<void**>(m_vars[i].into);
                void* alloc_base = NULL;
                int64_t alloc_disp = 0;
                int64_t alloc_size;
                if (m_vars_extra[i].alloc != NULL) {
                    // array descriptor
                    const Arr_Desc *ap =
                        static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);
                    __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
                    alloc_base = reinterpret_cast<void*>(ap->base);
                }

                // get pointer to target memory
                m_out.receive_data(&ptr_value, sizeof(void*));

                // add new entry
                if (!alloc_ptr_data(
                        ptr_data,
                        ptr_value,
                        (alloc_base != NULL) ?
                            alloc_disp : m_vars[i].disp,
                        (alloc_base != NULL) ?
                            alloc_size : m_vars[i].size,
                        alloc_disp,
                        0,
                        m_vars[i].flags.targetptr,
                        m_vars[i].flags.preallocated,
                        m_vars[i].flags.pin)) {
                    return false;
                }

                ptr_data->add_reference();
                *cpu_ptr = ptr_value;
                if (src_is_for_mic) {
                    m_vars_extra[i].src_data = ptr_data;
                }
                else {
                    m_vars_extra[i].dst_data = ptr_data;
                }
                m_vars[i].offset = (char*) ptr_value -
                                   (char*) ptr_data->cpu_addr.start();
            }

            switch (m_vars_extra[i].type_src) {
                case c_data_ptr_array:
                    // Expanded separately into per-element descriptors.
                    break;
                case c_data:
                case c_void_ptr:
                case c_void_ptr_ptr:
                case c_cean_var:
                    // Automatic (non-static) variables travel through the
                    // marshalled buffer; static ones go via COI buffers.
                    if (m_vars[i].direction.out &&
                        !m_vars[i].flags.is_static) {

                        if (m_vars[i].into) {
                            char *ptr = offload_get_src_base(
                                static_cast<char*>(m_vars[i].into),
                                m_vars_extra[i].type_dst);
                            m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
                                               m_vars[i].size);
                        }
                        else {
                            m_out.receive_data(
                                static_cast<char*>(m_vars[i].ptr) +
                                    m_vars_extra[i].cpu_disp,
                                m_vars[i].size);
                        }
                    }
                    break;

                case c_func_ptr:
                case c_func_ptr_ptr:
                    // Function pointers are translated back through the
                    // function-pointer table, not copied bitwise.
                    if (m_vars[i].direction.out) {
                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
                    }
                    break;

                default:
                    break;
            }
        }

        if (m_status) {
            m_status->data_received += m_out.get_tfr_size();
        }

        // Undo the BufferMap done above (same condition).
        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_unmap(get_timer_data(),
                                     c_offload_host_unmap_out_data_buffer);

            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_unmap, res);
            }
        }
    }

    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());

    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
                  m_out.get_tfr_size());

    return true;
}
  5013. static void get_arr_desc_numbers(
  5014. const Arr_Desc *ap,
  5015. int64_t el_size,
  5016. int64_t &offset,
  5017. int64_t &size,
  5018. int &el_number,
  5019. CeanReadRanges* &ptr_ranges
  5020. )
  5021. {
  5022. if (is_arr_desc_contiguous(ap)) {
  5023. ptr_ranges = NULL;
  5024. __arr_data_offset_and_length(ap, offset, size);
  5025. el_number = size / el_size;
  5026. }
  5027. else {
  5028. ptr_ranges = init_read_ranges_arr_desc(ap);
  5029. el_number = (ptr_ranges->range_size / el_size) *
  5030. ptr_ranges->range_max_number;
  5031. size = ptr_ranges->range_size;
  5032. }
  5033. }
  5034. bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
  5035. {
  5036. int pointers_number;
  5037. int tmp_val;
  5038. int new_index = m_vars_total;
  5039. const Arr_Desc *ap;
  5040. const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
  5041. int flags = vd3->array_fields;
  5042. bool src_is_for_mic = (m_vars[i].direction.out ||
  5043. m_vars[i].into == NULL);
  5044. ReadArrElements<void *> ptr;
  5045. ReadArrElements<void *> into;
  5046. ReadArrElements<int64_t> ext_start;
  5047. ReadArrElements<int64_t> ext_elements;
  5048. ReadArrElements<int64_t> align;
  5049. ReadArrElements<int64_t> alloc_if;
  5050. ReadArrElements<int64_t> free_if;
  5051. ReadArrElements<int64_t> into_start;
  5052. ReadArrElements<int64_t> into_elem;
  5053. ReadArrElements<int64_t> alloc_start;
  5054. ReadArrElements<int64_t> alloc_elem;
  5055. ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
  5056. // "pointers_number" for total number of transferred pointers.
  5057. // For each of them we create new var_desc and put it at the bottom
  5058. // of the var_desc's array
  5059. get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
  5060. pointers_number, ptr.ranges);
  5061. ptr.base = reinterpret_cast<char*>(ap->base);
  5062. // 2. prepare memory for new var_descs
  5063. m_vars_total += pointers_number;
  5064. m_vars = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
  5065. if (m_vars == NULL)
  5066. LIBOFFLOAD_ERROR(c_malloc);
  5067. m_vars_extra =
  5068. (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
  5069. if (m_vars_extra == NULL)
  5070. LIBOFFLOAD_ERROR(c_malloc);
  5071. m_in_deps =
  5072. (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
  5073. if (m_in_deps == NULL)
  5074. LIBOFFLOAD_ERROR(c_malloc);
  5075. m_out_deps =
  5076. (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
  5077. if (m_out_deps == NULL)
  5078. LIBOFFLOAD_ERROR(c_malloc);
  5079. // 3. Prepare for reading new var_desc's fields
  5080. // EXTENT START
  5081. if ((flags & (1<<flag_extent_start_is_array)) != 0) {
  5082. ap = static_cast<const Arr_Desc*>(vd3->extent_start);
  5083. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
  5084. ext_start.size, tmp_val, ext_start.ranges);
  5085. ext_start.base = reinterpret_cast<char*>(ap->base);
  5086. ext_start.el_size = ap->dim[ap->rank - 1].size;
  5087. if (tmp_val < pointers_number) {
  5088. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
  5089. return false;
  5090. }
  5091. }
  5092. else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
  5093. ext_start.val = (int64_t)vd3->extent_start;
  5094. }
  5095. else {
  5096. ext_start.val = 0;
  5097. }
  5098. // EXTENT ELEMENTS NUMBER
  5099. if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
  5100. ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
  5101. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
  5102. ext_elements.offset, ext_elements.size,
  5103. tmp_val, ext_elements.ranges);
  5104. ext_elements.base = reinterpret_cast<char*>(ap->base);
  5105. ext_elements.el_size = ap->dim[ap->rank - 1].size;
  5106. if (tmp_val < pointers_number) {
  5107. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
  5108. return false;
  5109. }
  5110. }
  5111. else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
  5112. ext_elements.val = (int64_t)vd3->extent_elements;
  5113. }
  5114. else {
  5115. ext_elements.val = m_vars[i].count;
  5116. }
  5117. // ALLOC_IF
  5118. if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
  5119. ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
  5120. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
  5121. alloc_if.size, tmp_val, alloc_if.ranges);
  5122. alloc_if.base = reinterpret_cast<char*>(ap->base);
  5123. alloc_if.el_size = ap->dim[ap->rank - 1].size;
  5124. if (tmp_val < pointers_number) {
  5125. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
  5126. return false;
  5127. }
  5128. }
  5129. else {
  5130. alloc_if.val = m_vars[i].alloc_if;
  5131. }
  5132. // FREE_IF
  5133. if ((flags & (1<<flag_free_if_is_array)) != 0) {
  5134. ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
  5135. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
  5136. free_if.size, tmp_val, free_if.ranges);
  5137. free_if.base = reinterpret_cast<char*>(ap->base);
  5138. free_if.el_size = ap->dim[ap->rank - 1].size;
  5139. if (tmp_val < pointers_number) {
  5140. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
  5141. return false;
  5142. }
  5143. }
  5144. else {
  5145. free_if.val = m_vars[i].free_if;
  5146. }
  5147. // ALIGN
  5148. if ((flags & (1<<flag_align_is_array)) != 0) {
  5149. ap = static_cast<const Arr_Desc*>(vd3->align_array);
  5150. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
  5151. align.size, tmp_val, align.ranges);
  5152. align.base = reinterpret_cast<char*>(ap->base);
  5153. align.el_size = ap->dim[ap->rank - 1].size;
  5154. if (tmp_val < pointers_number) {
  5155. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
  5156. return false;
  5157. }
  5158. }
  5159. else {
  5160. align.val = m_vars[i].align;
  5161. }
  5162. // 3.1 INTO
  5163. if (m_vars[i].into) {
  5164. ap = static_cast<const Arr_Desc*>(m_vars[i].into);
  5165. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
  5166. into.size, tmp_val, into.ranges);
  5167. into.base = reinterpret_cast<char*>(ap->base);
  5168. if (tmp_val < pointers_number) {
  5169. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
  5170. return false;
  5171. }
  5172. }
  5173. // 3.2 INTO_START
  5174. if ((flags & (1<<flag_into_start_is_array)) != 0) {
  5175. ap = static_cast<const Arr_Desc*>(vd3->into_start);
  5176. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
  5177. into_start.size, tmp_val, into_start.ranges);
  5178. into_start.base = reinterpret_cast<char*>(ap->base);
  5179. into_start.el_size = ap->dim[ap->rank - 1].size;
  5180. if (tmp_val < pointers_number) {
  5181. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
  5182. return false;
  5183. }
  5184. }
  5185. else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
  5186. into_start.val = (int64_t)vd3->into_start;
  5187. }
  5188. else {
  5189. into_start.val = 0;
  5190. }
  5191. // 3.3 INTO_ELEMENTS
  5192. if ((flags & (1<<flag_into_elements_is_array)) != 0) {
  5193. ap = static_cast<const Arr_Desc*>(vd3->into_elements);
  5194. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
  5195. into_elem.size, tmp_val, into_elem.ranges);
  5196. into_elem.base = reinterpret_cast<char*>(ap->base);
  5197. into_elem.el_size = ap->dim[ap->rank - 1].size;
  5198. if (tmp_val < pointers_number) {
  5199. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
  5200. return false;
  5201. }
  5202. }
  5203. else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
  5204. into_elem.val = (int64_t)vd3->into_elements;
  5205. }
  5206. else {
  5207. into_elem.val = m_vars[i].count;
  5208. }
  5209. // alloc_start
  5210. if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
  5211. ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
  5212. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
  5213. alloc_start.offset, alloc_start.size, tmp_val,
  5214. alloc_start.ranges);
  5215. alloc_start.base = reinterpret_cast<char*>(ap->base);
  5216. alloc_start.el_size = ap->dim[ap->rank - 1].size;
  5217. if (tmp_val < pointers_number) {
  5218. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
  5219. return false;
  5220. }
  5221. }
  5222. else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
  5223. alloc_start.val = (int64_t)vd3->alloc_start;
  5224. }
  5225. else {
  5226. alloc_start.val = 0;
  5227. }
  5228. // alloc_elem
  5229. if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
  5230. ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
  5231. get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
  5232. alloc_elem.size, tmp_val, alloc_elem.ranges);
  5233. alloc_elem.base = reinterpret_cast<char*>(ap->base);
  5234. alloc_elem.el_size = ap->dim[ap->rank - 1].size;
  5235. if (tmp_val < pointers_number) {
  5236. LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
  5237. "alloc_extent elements");
  5238. return false;
  5239. }
  5240. }
  5241. else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
  5242. alloc_elem.val = (int64_t)vd3->alloc_elements;
  5243. }
  5244. else {
  5245. alloc_elem.val = 0;
  5246. }
  5247. for (int k = 0; k < pointers_number; k++) {
  5248. int type = flags & 0x3f;
  5249. int type_src, type_dst;
  5250. // Get new values
  5251. // type_src, type_dst
  5252. type_src = type_dst = (type == c_data_ptr_array) ?
  5253. c_data_ptr : (type == c_func_ptr_array) ?
  5254. c_func_ptr : (type == c_void_ptr_array) ?
  5255. c_void_ptr : (type == c_string_ptr_array) ?
  5256. c_string_ptr : 0;
  5257. // Get ptr val
  5258. if (!ptr.read_next(true)) {
  5259. break;
  5260. }
  5261. else {
  5262. ptr.val = (void*)(ptr.base + ptr.offset);
  5263. }
  5264. // !!! If we got error at phase of reading - it's an internal
  5265. // !!! error, as we must detect mismatch before
  5266. // Get into val
  5267. if (m_vars[i].into) {
  5268. if (!into.read_next(true)) {
  5269. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
  5270. LIBOFFLOAD_ABORT;
  5271. }
  5272. else {
  5273. into.val = (void*)(into.base + into.offset);
  5274. }
  5275. }
  5276. // Get other components of the clause
  5277. if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
  5278. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
  5279. LIBOFFLOAD_ABORT;
  5280. }
  5281. if (!ext_elements.read_next(
  5282. flags & (1<<flag_extent_elements_is_array))) {
  5283. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
  5284. LIBOFFLOAD_ABORT;
  5285. }
  5286. if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
  5287. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
  5288. LIBOFFLOAD_ABORT;
  5289. }
  5290. if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
  5291. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
  5292. LIBOFFLOAD_ABORT;
  5293. }
  5294. if (!align.read_next(flags & (1<<flag_align_is_array))) {
  5295. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
  5296. LIBOFFLOAD_ABORT;
  5297. }
  5298. if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
  5299. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
  5300. LIBOFFLOAD_ABORT;
  5301. }
  5302. if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
  5303. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
  5304. LIBOFFLOAD_ABORT;
  5305. }
  5306. if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
  5307. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
  5308. LIBOFFLOAD_ABORT;
  5309. }
  5310. if (!alloc_elem.read_next(
  5311. flags & (1<<flag_alloc_elements_is_array))) {
  5312. LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
  5313. LIBOFFLOAD_ABORT;
  5314. }
  5315. m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
  5316. m_vars[new_index + k].alloc_if = alloc_if.val;
  5317. m_vars[new_index + k].free_if = free_if.val;
  5318. m_vars[new_index + k].align = align.val;
  5319. m_vars[new_index + k].mic_offset = 0;
  5320. m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
  5321. m_vars[new_index + k].flags.is_pointer = 0;
  5322. m_vars[new_index + k].offset = 0;
  5323. m_vars[new_index + k].size = m_vars[i].size;
  5324. m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
  5325. m_vars[new_index + k].flags.preallocated =
  5326. m_vars[i].flags.preallocated;
  5327. if (ext_start.val == 0) {
  5328. m_vars[new_index + k].count = ext_elements.val;
  5329. m_vars[new_index + k].ptr = ptr.val;
  5330. if (type_src == c_string_ptr) {
  5331. m_vars[new_index + k].size = 0;
  5332. }
  5333. }
  5334. else {
  5335. m_vars[new_index + k].count = 0;
  5336. m_vars[new_index + k].ptr =
  5337. static_cast<void*>(make_arr_desc(
  5338. ptr.val,
  5339. ext_start.val,
  5340. ext_elements.val,
  5341. m_vars[i].size));
  5342. type_src = type_src == c_data_ptr ? c_cean_var_ptr :
  5343. c_string_ptr ? c_cean_var_ptr :
  5344. type_src;
  5345. if (!m_vars[i].into) {
  5346. type_dst = type_src;
  5347. }
  5348. }
  5349. if (m_vars[i].into && into_elem.val != 0) {
  5350. m_vars[new_index + k].into =
  5351. static_cast<void*>(make_arr_desc(
  5352. into.val,
  5353. into_start.val,
  5354. into_elem.val,
  5355. m_vars[i].size));
  5356. type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
  5357. (type == c_string_ptr_array) ? c_cean_var_ptr :
  5358. type_src;
  5359. }
  5360. else {
  5361. m_vars[new_index + k].into = NULL;
  5362. }
  5363. if (alloc_elem.val != 0) {
  5364. m_vars[new_index + k].alloc =
  5365. static_cast<void*>(make_arr_desc(
  5366. ptr.val,
  5367. alloc_start.val,
  5368. alloc_elem.val,
  5369. m_vars[i].size));
  5370. }
  5371. else {
  5372. m_vars[new_index + k].alloc = NULL;
  5373. }
  5374. m_vars[new_index + k].type.src =
  5375. m_vars_extra[new_index + k].type_src = type_src;
  5376. m_vars[new_index + k].type.dst =
  5377. m_vars_extra[new_index + k].type_dst = type_dst;
  5378. m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
  5379. m_vars_extra[new_index + k].is_arr_ptr_el = 1;
  5380. m_vars_extra[new_index + k].ptr_arr_offset =
  5381. src_is_for_mic ? ptr.offset : into.offset;
  5382. }
  5383. // count and alloc fields are useless at target. They can be reused
  5384. // for pointer arrays.
  5385. m_vars[i].count = pointers_number;
  5386. m_vars[i].ptr_arr_offset = new_index;
  5387. return true;
  5388. }
  5389. // Gets in dependencies of the previous offload via the stream "m_stream".
  5390. // Out argument in_deps_amount - address of amount of the dependencies
  5391. // Out argument in_deps - address of array of dependencies.
  5392. // Description of the dependencies scheme for streams :
  5393. // ----------------------------------------------------
  5394. // Every offload forms DAG consisted of 3 nodes:
  5395. // for in-transfers, runfunction and out-transfers.
  5396. // Every node has in-dependencies and out-dependencies
  5397. // Out-dependencies of previous node forms in-dependencies of current node.
  5398. // In-dependencies of 1-st node (of in-transfers) without streams is equal
  5399. // to NULL. For streams in-dependencies of 1-st node is equal to list of out
  5400. // dependencies of last node of previous offload via this stream.
  5401. // So we can say that DAGs of 2 consequent offloads via the same stream are
  5402. // connected by the way described above.
  5403. void OffloadDescriptor::get_stream_in_dependencies(
  5404. uint32_t &in_deps_amount,
  5405. COIEVENT* &in_deps
  5406. )
  5407. {
  5408. if (m_stream != no_stream && m_stream != 0) {
  5409. Stream * stream = Stream::find_stream(m_stream, false);
  5410. if (!stream) {
  5411. LIBOFFLOAD_ERROR(c_offload_no_stream,
  5412. m_device.get_logical_index());
  5413. LIBOFFLOAD_ABORT;
  5414. }
  5415. OffloadDescriptor* offload = stream->get_last_offload();
  5416. // if it's the first offload in the stream
  5417. if (!offload) {
  5418. return;
  5419. }
  5420. // if last offload has out-tranfers
  5421. if (offload->m_out_deps_total) {
  5422. in_deps_amount = offload->m_out_deps_total;
  5423. in_deps = offload->m_out_deps;
  5424. }
  5425. // last offload only sends pointer data or run function or both of them
  5426. // and has no out-transfers
  5427. else if (offload->m_in_deps_total) {
  5428. in_deps_amount = offload->m_in_deps_total;
  5429. in_deps = offload->m_in_deps;
  5430. }
  5431. }
  5432. }
  5433. static void __offload_fini_library(void)
  5434. {
  5435. OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
  5436. if (mic_engines_total > 0) {
  5437. delete[] mic_engines;
  5438. mic_engines_total = 0;
  5439. if (mic_proxy_fs_root != 0) {
  5440. free(mic_proxy_fs_root);
  5441. mic_proxy_fs_root = 0;
  5442. }
  5443. if (knc_library_path != 0) {
  5444. free(knc_library_path);
  5445. knc_library_path = 0;
  5446. }
  5447. if (knl_library_path != 0) {
  5448. free(knl_library_path);
  5449. knl_library_path = 0;
  5450. }
  5451. // destroy thread key
  5452. thread_key_delete(mic_thread_key);
  5453. }
  5454. // unload COI library
  5455. if (COI::is_available) {
  5456. COI::fini();
  5457. }
  5458. OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
  5459. }
// (physical device index, logical-CPU mask) pair; the mask is NULL when
// no CPU subset was specified for that device.
typedef std::pair<int, micLcpuMask*> deviceLcpu;
// Ordered list of selected devices, as parsed by process_offload_devices().
typedef std::list<deviceLcpu> deviceLcpuList;
  5462. static int process_offload_devices(
  5463. const char *env_var,
  5464. uint32_t num_devices,
  5465. deviceLcpuList &device_cpu_list
  5466. )
  5467. {
  5468. // Value is composed of comma separated physical device index
  5469. // optionally qualified by logical CPU subset, e.g. 0[60,70-80]
  5470. char *buf = strdup(env_var);
  5471. if (buf == NULL)
  5472. LIBOFFLOAD_ERROR(c_malloc);
  5473. char *str = buf;
  5474. bool device_set_finished = false;
  5475. int num_devices_specified = 0;
  5476. do {
  5477. char *dev_ptr = str;
  5478. int dev_len = strcspn(str, "[,");
  5479. micLcpuMask* cpu_mask = 0;
  5480. if (str[dev_len] == '[') {
  5481. // CPU subset specified
  5482. cpu_mask = new micLcpuMask;
  5483. cpu_mask->reset();
  5484. char *cpu_ptr = str + dev_len + 1;
  5485. do {
  5486. int64_t cnum;
  5487. bool cpu_set_finished = false;
  5488. int cpu_len = strcspn(cpu_ptr, ",-]");
  5489. if (cpu_ptr[cpu_len] == ',' || cpu_ptr[cpu_len] == ']') {
  5490. // A single CPU specified
  5491. cpu_set_finished = cpu_ptr[cpu_len] == ']';
  5492. cpu_ptr[cpu_len] = '\0';
  5493. // Convert cpu string to an int
  5494. if (!__offload_parse_int_string(cpu_ptr, cnum)) {
  5495. LIBOFFLOAD_ERROR(c_mic_init7);
  5496. delete cpu_mask;
  5497. free(buf);
  5498. return 0;
  5499. } else {
  5500. OFFLOAD_DEBUG_TRACE(3,
  5501. "Single CPU %d selected\n", cnum);
  5502. cpu_mask->set(cnum);
  5503. }
  5504. cpu_ptr = cpu_ptr + cpu_len + 1;
  5505. if (cpu_set_finished) {
  5506. break;
  5507. }
  5508. } else if (cpu_ptr[cpu_len] == '-') {
  5509. int64_t range_start, range_end;
  5510. // A range of CPUs specified
  5511. cpu_ptr[cpu_len] = '\0';
  5512. // Convert cpu string to an int
  5513. if (!__offload_parse_int_string(cpu_ptr, range_start)) {
  5514. LIBOFFLOAD_ERROR(c_mic_init8);
  5515. delete cpu_mask;
  5516. free(buf);
  5517. return 0;
  5518. } else {
  5519. OFFLOAD_DEBUG_TRACE(3,
  5520. "Start of CPU range specified as %d\n",
  5521. range_start);
  5522. cpu_ptr = cpu_ptr + cpu_len + 1;
  5523. cpu_len = strcspn(cpu_ptr, ",]");
  5524. if (cpu_ptr[cpu_len] == ',' ||
  5525. cpu_ptr[cpu_len] == ']') {
  5526. cpu_set_finished = cpu_ptr[cpu_len] == ']';
  5527. cpu_ptr[cpu_len] = '\0';
  5528. // Convert cpu string to an int
  5529. if (!__offload_parse_int_string(
  5530. cpu_ptr, range_end)) {
  5531. LIBOFFLOAD_ERROR(c_mic_init9);
  5532. delete cpu_mask;
  5533. free(buf);
  5534. return 0;
  5535. } else {
  5536. OFFLOAD_DEBUG_TRACE(3,
  5537. "End of CPU range specified as %d\n",
  5538. range_end);
  5539. if (range_end < range_start) {
  5540. LIBOFFLOAD_ERROR(c_mic_init10);
  5541. delete cpu_mask;
  5542. free(buf);
  5543. return 0;
  5544. } else {
  5545. for (int i=range_start; i<=range_end; i++)
  5546. {
  5547. OFFLOAD_DEBUG_TRACE(3,
  5548. "CPU %d selected as part of range\n",
  5549. i);
  5550. cpu_mask->set(i);
  5551. }
  5552. cpu_ptr = cpu_ptr + cpu_len + 1;
  5553. if (cpu_set_finished) {
  5554. break;
  5555. }
  5556. }
  5557. }
  5558. } else {
  5559. LIBOFFLOAD_ERROR(c_mic_init10);
  5560. delete cpu_mask;
  5561. free(buf);
  5562. return 0;
  5563. }
  5564. }
  5565. } else {
  5566. // Error: expected , or - or ]
  5567. LIBOFFLOAD_ERROR(c_mic_init11);
  5568. delete cpu_mask;
  5569. free(buf);
  5570. return 0;
  5571. }
  5572. } while (true);
  5573. // Point to next device specification
  5574. str = cpu_ptr;
  5575. if (*str == '\0') {
  5576. device_set_finished = true;
  5577. } else {
  5578. // Skip the comma after a device specification
  5579. str++;
  5580. }
  5581. } else if (str[dev_len] == ',') {
  5582. // CPU subset not specified
  5583. // Point to next device specification
  5584. str = str + dev_len + 1;
  5585. } else {
  5586. // No more device specifications
  5587. device_set_finished = true;
  5588. }
  5589. dev_ptr[dev_len] = '\0';
  5590. // Convert device string to an int
  5591. int64_t num;
  5592. if (!__offload_parse_int_string(dev_ptr, num)) {
  5593. LIBOFFLOAD_ERROR(c_mic_init5);
  5594. delete cpu_mask;
  5595. free(buf);
  5596. return 0;
  5597. }
  5598. if (num < 0 || num >= num_devices) {
  5599. LIBOFFLOAD_ERROR(c_mic_init6, num);
  5600. delete cpu_mask;
  5601. free(buf);
  5602. return 0;
  5603. }
  5604. OFFLOAD_DEBUG_TRACE(3, "Offloadable MIC = %d\n", num);
  5605. // Save the specified physical device and cpu mask
  5606. device_cpu_list.push_back(make_pair(num, cpu_mask));
  5607. num_devices_specified++;
  5608. if (device_set_finished) {
  5609. break;
  5610. }
  5611. } while (true);
  5612. free(buf);
  5613. return num_devices_specified;
  5614. }
  5615. static void __offload_init_library_once(void)
  5616. {
  5617. COIRESULT res;
  5618. uint32_t num_devices;
  5619. deviceLcpuList device_cpu_list;
  5620. prefix = report_get_message_str(c_report_host);
  5621. // initialize trace
  5622. const char *env_var = getenv(htrace_envname);
  5623. if (env_var != 0 && *env_var != '\0') {
  5624. int64_t new_val;
  5625. if (__offload_parse_int_string(env_var, new_val)) {
  5626. console_enabled = new_val & 0x0f;
  5627. }
  5628. }
  5629. OFFLOAD_DEBUG_TRACE(2, "---- Start of environment variable processing\n");
  5630. env_var = getenv(offload_report_envname);
  5631. if (env_var != 0 && *env_var != '\0') {
  5632. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  5633. offload_report_envname, env_var);
  5634. int64_t env_val;
  5635. if (__offload_parse_int_string(env_var, env_val)) {
  5636. if (env_val == OFFLOAD_REPORT_1 ||
  5637. env_val == OFFLOAD_REPORT_2 ||
  5638. env_val == OFFLOAD_REPORT_3) {
  5639. offload_report_level = env_val;
  5640. OFFLOAD_DEBUG_TRACE(2, "Offload report level set to %d\n",
  5641. offload_report_level);
  5642. }
  5643. else {
  5644. LIBOFFLOAD_ERROR(c_invalid_env_report_value,
  5645. offload_report_envname);
  5646. }
  5647. }
  5648. else {
  5649. LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
  5650. offload_report_envname);
  5651. }
  5652. }
  5653. else if (!offload_report_level) {
  5654. env_var = getenv(timer_envname);
  5655. if (env_var != 0 && *env_var != '\0') {
  5656. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n", timer_envname, env_var);
  5657. timer_enabled = atoi(env_var);
  5658. OFFLOAD_DEBUG_TRACE(2, "Timer enable flag set to %d\n",
  5659. timer_enabled);
  5660. }
  5661. }
  5662. // initialize COI
  5663. if (!COI::init()) {
  5664. return;
  5665. }
  5666. // Process OFFLOAD_NODES, specification of physical MICs available
  5667. env_var = getenv("OFFLOAD_NODES");
  5668. if (env_var != 0 && *env_var != '\0') {
  5669. OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_NODES=%s\n", env_var);
  5670. // Pass env var on to COI
  5671. char * new_env_var =
  5672. (char*) malloc(sizeof("COI_OFFLOAD_NODES=") +
  5673. strlen(env_var) + 1);
  5674. if (new_env_var == NULL)
  5675. LIBOFFLOAD_ERROR(c_malloc);
  5676. sprintf(new_env_var, "COI_OFFLOAD_NODES=%s", env_var);
  5677. putenv(new_env_var);
  5678. OFFLOAD_DEBUG_TRACE(2, "Setting COI_OFFLOAD_NODES = %s \n", getenv("COI_OFFLOAD_NODES"));
  5679. // value is composed of comma separated physical device indexes
  5680. char *buf = strdup(env_var);
  5681. if (buf == NULL)
  5682. LIBOFFLOAD_ERROR(c_malloc);
  5683. char *str, *ptr;
  5684. int num_mics = 0;
  5685. for (str = strtok_r(buf, ",", &ptr); str != 0;
  5686. str = strtok_r(0, ",", &ptr)) {
  5687. // count this MIC
  5688. num_mics++;
  5689. }
  5690. OFFLOAD_DEBUG_TRACE(2, "Number of offloadable MICs = %d\n", num_mics);
  5691. free(buf);
  5692. }
  5693. else {
  5694. OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_NODES is not set\n");
  5695. }
  5696. // get number of devices installed in the system
  5697. res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
  5698. if (res != COI_SUCCESS) {
  5699. return;
  5700. }
  5701. if (num_devices > MIC_ENGINES_MAX) {
  5702. num_devices = MIC_ENGINES_MAX;
  5703. }
  5704. // Determine devices & cpus that can be used for offloading
  5705. env_var = getenv("OFFLOAD_DEVICES");
  5706. if (env_var != 0 && *env_var != '\0') {
  5707. OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DEVICES=%s\n", env_var);
  5708. if (strcasecmp(env_var, "none") != 0) {
  5709. mic_engines_total =
  5710. process_offload_devices(
  5711. env_var, num_devices, device_cpu_list);
  5712. if (mic_engines_total > 0) {
  5713. OFFLOAD_DEBUG_TRACE(2, "Valid value, %d device(s) specified\n",
  5714. mic_engines_total);
  5715. }
  5716. else {
  5717. OFFLOAD_DEBUG_TRACE(2, "Invalid value, will not offload\n");
  5718. return;
  5719. }
  5720. }
  5721. else {
  5722. // No need to continue since no offload devices
  5723. return;
  5724. }
  5725. }
  5726. else {
  5727. OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DEVICES is not set\n");
  5728. }
  5729. if (mic_engines_total == 0) {
  5730. // Fallback to using all available devices and all CPUs on each
  5731. OFFLOAD_DEBUG_TRACE(2, "Fallback to all devices\n");
  5732. device_cpu_list.clear();
  5733. mic_engines_total = 0;
  5734. for (int i = 0; i < num_devices; i++) {
  5735. COIENGINE engine;
  5736. res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
  5737. if (res == COI_SUCCESS) {
  5738. device_cpu_list.push_back(make_pair(i, (micLcpuMask*)0));
  5739. OFFLOAD_DEBUG_TRACE(2, "Device %d is available\n", i);
  5740. mic_engines_total++;
  5741. }
  5742. }
  5743. }
  5744. // no need to continue if there are no devices to offload to
  5745. if (mic_engines_total <= 0) {
  5746. return;
  5747. }
  5748. // Initialize indexes for available devices
  5749. mic_engines = new Engine[mic_engines_total];
  5750. std::list<deviceLcpu>::iterator deviceIterator;
  5751. int l_idx = 0;
  5752. for (deviceIterator = device_cpu_list.begin();
  5753. deviceIterator != device_cpu_list.end();
  5754. deviceIterator++)
  5755. {
  5756. deviceLcpu device_mask_pair = *deviceIterator;
  5757. int device_num = device_mask_pair.first;
  5758. micLcpuMask *device_mask = device_mask_pair.second;
  5759. mic_engines[l_idx].set_indexes(l_idx, device_num);
  5760. mic_engines[l_idx].set_cpu_mask(device_mask);
  5761. OFFLOAD_DEBUG_TRACE(2,
  5762. "Logical MIC%d => Physical MIC%d\n", l_idx, device_num);
  5763. if (device_mask != NULL) {
  5764. std::string cpu_string =
  5765. device_mask->to_string<
  5766. char,
  5767. std::string::traits_type,
  5768. std::string::allocator_type>();
  5769. OFFLOAD_DEBUG_TRACE(2, " CPUs: %s\n", cpu_string.data());
  5770. }
  5771. else {
  5772. OFFLOAD_DEBUG_TRACE(2, " CPUs: all\n");
  5773. }
  5774. l_idx++;
  5775. }
  5776. // Get DMA channel count to pass it to COI
  5777. env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
  5778. if (env_var != 0 && *env_var != '\0') {
  5779. OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DMA_CHANNEL_COUNT=%s\n", env_var);
  5780. int64_t new_val;
  5781. if (__offload_parse_int_string(env_var, new_val)) {
  5782. mic_dma_channel_count = new_val;
  5783. OFFLOAD_DEBUG_TRACE(2, "Using %d DMA channels\n",
  5784. mic_dma_channel_count);
  5785. }
  5786. else {
  5787. LIBOFFLOAD_ERROR(c_invalid_env_var_value,
  5788. "OFFLOAD_DMA_CHANNEL_COUNT");
  5789. }
  5790. }
  5791. else {
  5792. OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DMA_CHANNEL_COUNT is not set\n");
  5793. }
  5794. // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
  5795. // Use putenv instead of setenv as Windows has no setenv.
  5796. // Note: putenv requires its argument can't be freed or modified.
  5797. // So no free after call to putenv or elsewhere.
  5798. env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
  5799. if (env_var != 0 && *env_var != '\0') {
  5800. OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_HOST_THREAD_AFFINITY=%s\n", env_var);
  5801. char * new_env_var =
  5802. (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
  5803. strlen(env_var) + 1);
  5804. if (new_env_var == NULL)
  5805. LIBOFFLOAD_ERROR(c_malloc);
  5806. sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
  5807. putenv(new_env_var);
  5808. OFFLOAD_DEBUG_TRACE(2, "Setting COI_HOST_THREAD_AFFINITY = %s \n",
  5809. getenv("COI_HOST_THREAD_AFFINITY"));
  5810. }
  5811. else {
  5812. OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_HOST_THREAD_AFFINITY is not set\n");
  5813. }
  5814. // library search path for KNC device binaries
  5815. env_var = getenv("MIC_LD_LIBRARY_PATH");
  5816. if (env_var != 0) {
  5817. OFFLOAD_DEBUG_TRACE(2, "---- MIC_LD_LIBRARY_PATH=%s\n", env_var);
  5818. knc_library_path = strdup(env_var);
  5819. if (knc_library_path == NULL)
  5820. LIBOFFLOAD_ERROR(c_malloc);
  5821. OFFLOAD_DEBUG_TRACE(2, "KNC library path set to %s\n", knc_library_path);
  5822. }
  5823. else {
  5824. OFFLOAD_DEBUG_TRACE(2, "MIC_LD_LIBRARY_PATH is not set\n");
  5825. }
  5826. // library search path for KNL device binaries
  5827. env_var = getenv("LD_LIBRARY_PATH");
  5828. if (env_var != 0) {
  5829. OFFLOAD_DEBUG_TRACE(2, "---- LD_LIBRARY_PATH=%s\n", env_var);
  5830. knl_library_path = strdup(env_var);
  5831. if (knl_library_path == NULL)
  5832. LIBOFFLOAD_ERROR(c_malloc);
  5833. OFFLOAD_DEBUG_TRACE(2, "KNL library path set to %s\n", knl_library_path);
  5834. }
  5835. else {
  5836. OFFLOAD_DEBUG_TRACE(2, "LD_LIBRARY_PATH is not set\n");
  5837. }
  5838. // memory size reserved for COI buffers
  5839. env_var = getenv("MIC_BUFFERSIZE");
  5840. if (env_var != 0 && *env_var != '\0') {
  5841. OFFLOAD_DEBUG_TRACE(2, "---- MIC_BUFFERSIZE=%s\n", env_var);
  5842. uint64_t new_size;
  5843. if (__offload_parse_size_string(env_var, new_size)) {
  5844. mic_buffer_size = new_size;
  5845. OFFLOAD_DEBUG_TRACE(2,
  5846. "Reserved memory for COI buffers set to %lld bytes\n",
  5847. mic_buffer_size);
  5848. }
  5849. else {
  5850. LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
  5851. }
  5852. }
  5853. else {
  5854. OFFLOAD_DEBUG_TRACE(2, "MIC_BUFFERSIZE is not set\n");
  5855. }
  5856. // memory size reserved for 4K pages for COI buffers
  5857. env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
  5858. if (env_var != 0 && *env_var != '\0') {
  5859. OFFLOAD_DEBUG_TRACE(2, "---- MIC_4K_BUFFER_RESERVE_SIZE=%s\n", env_var);
  5860. uint64_t new_size;
  5861. if (__offload_parse_size_string(env_var, new_size)) {
  5862. mic_4k_buffer_size = new_size;
  5863. OFFLOAD_DEBUG_TRACE(2,
  5864. "Reserved memory for 4K COI buffers set to %lld bytes\n",
  5865. mic_4k_buffer_size);
  5866. }
  5867. else {
  5868. LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE");
  5869. }
  5870. }
  5871. else {
  5872. OFFLOAD_DEBUG_TRACE(2, "MIC_4K_BUFFER_RESERVE_SIZE is not set\n");
  5873. }
  5874. // memory size reserved for 2M pages for COI buffers
  5875. env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
  5876. if (env_var != 0 && *env_var != '\0') {
  5877. OFFLOAD_DEBUG_TRACE(2, "---- MIC_2M_BUFFER_RESERVE_SIZE=%s\n", env_var);
  5878. uint64_t new_size;
  5879. if (__offload_parse_size_string(env_var, new_size)) {
  5880. mic_2m_buffer_size = new_size;
  5881. OFFLOAD_DEBUG_TRACE(2,
  5882. "Reserved memory for 2M COI buffers set to %lld bytes\n",
  5883. mic_2m_buffer_size);
  5884. }
  5885. else {
  5886. LIBOFFLOAD_ERROR(c_invalid_env_var_value,
  5887. "MIC_2M_BUFFER_RESERVE_SIZE");
  5888. }
  5889. }
  5890. else {
  5891. OFFLOAD_DEBUG_TRACE(2, "MIC_2M_BUFFER_RESERVE_SIZE is not set\n");
  5892. }
  5893. // determine stacksize for the pipeline on the device
  5894. env_var = getenv("MIC_STACKSIZE");
  5895. if (env_var != 0 && *env_var != '\0') {
  5896. OFFLOAD_DEBUG_TRACE(2, "---- MIC_STACKSIZE=%s\n", env_var);
  5897. uint64_t new_size;
  5898. if (__offload_parse_size_string(env_var, new_size) &&
  5899. (new_size >= 16384) && ((new_size & 4095) == 0)) {
  5900. mic_stack_size = new_size;
  5901. OFFLOAD_DEBUG_TRACE(2, "MIC stack size set to %lld bytes\n",
  5902. mic_stack_size);
  5903. }
  5904. else {
  5905. LIBOFFLOAD_ERROR(c_mic_init3);
  5906. }
  5907. }
  5908. else {
  5909. OFFLOAD_DEBUG_TRACE(2, "MIC_STACKSIZE is not set\n");
  5910. }
  5911. // proxy I/O
  5912. env_var = getenv("MIC_PROXY_IO");
  5913. if (env_var != 0 && *env_var != '\0') {
  5914. OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_IO=%s\n", env_var);
  5915. int64_t new_val;
  5916. if (__offload_parse_int_string(env_var, new_val)) {
  5917. mic_proxy_io = new_val;
  5918. OFFLOAD_DEBUG_TRACE(2, "MIC proxy i/o set to %s\n",
  5919. mic_proxy_io);
  5920. }
  5921. else {
  5922. LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
  5923. }
  5924. }
  5925. else {
  5926. OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_IO is not set\n");
  5927. }
  5928. env_var = getenv("MIC_PROXY_FS_ROOT");
  5929. if (env_var != 0 && *env_var != '\0') {
  5930. OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_FS_ROOT=%s\n", env_var);
  5931. mic_proxy_fs_root = strdup(env_var);
  5932. if (mic_proxy_fs_root == NULL)
  5933. LIBOFFLOAD_ERROR(c_malloc);
  5934. OFFLOAD_DEBUG_TRACE(2, "MIC proxy fs root set to %s\n",
  5935. mic_proxy_fs_root);
  5936. }
  5937. else {
  5938. OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_FS_ROOT is not set\n");
  5939. }
  5940. // Prepare environment for the target process using the following
  5941. // rules
  5942. // - If MIC_ENV_PREFIX is set then any environment variable on the
  5943. // host which has that prefix are copied to the device without
  5944. // the prefix.
  5945. // All other host environment variables are ignored.
  5946. // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
  5947. // environment is duplicated.
  5948. env_var = getenv("MIC_ENV_PREFIX");
  5949. if (env_var != 0 && *env_var != '\0') {
  5950. OFFLOAD_DEBUG_TRACE(2, "---- MIC_ENV_PREFIX=%s\n", env_var);
  5951. mic_env_vars.set_prefix(env_var);
  5952. int len = strlen(env_var);
  5953. for (int i = 0; environ[i] != 0; i++) {
  5954. if (strncmp(environ[i], env_var, len) == 0 &&
  5955. strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
  5956. environ[i][len] != '=') {
  5957. mic_env_vars.analyze_env_var(environ[i]);
  5958. }
  5959. }
  5960. }
  5961. else {
  5962. OFFLOAD_DEBUG_TRACE(2, "MIC_ENV_PREFIX is not set\n");
  5963. }
  5964. // create key for thread data
  5965. if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
  5966. LIBOFFLOAD_ERROR(c_mic_init4, errno);
  5967. return;
  5968. }
  5969. // cpu frequency
  5970. cpu_frequency = COI::PerfGetCycleFrequency();
  5971. env_var = getenv(mic_use_2mb_buffers_envname);
  5972. if (env_var != 0 && *env_var != '\0') {
  5973. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  5974. mic_use_2mb_buffers_envname, env_var);
  5975. uint64_t new_size;
  5976. if (__offload_parse_size_string(env_var, new_size)) {
  5977. __offload_use_2mb_buffers = new_size;
  5978. OFFLOAD_DEBUG_TRACE(2,
  5979. "Threshold for use of 2M buffers set to %lld\n",
  5980. __offload_use_2mb_buffers);
  5981. }
  5982. else {
  5983. LIBOFFLOAD_ERROR(c_invalid_env_var_value,
  5984. mic_use_2mb_buffers_envname);
  5985. }
  5986. }
  5987. else {
  5988. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", mic_use_2mb_buffers_envname);
  5989. }
  5990. env_var = getenv(mic_use_async_buffer_write_envname);
  5991. if (env_var != 0 && *env_var != '\0') {
  5992. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  5993. mic_use_async_buffer_write_envname, env_var);
  5994. uint64_t new_size;
  5995. if (__offload_parse_size_string(env_var, new_size)) {
  5996. __offload_use_async_buffer_write = new_size;
  5997. OFFLOAD_DEBUG_TRACE(2,
  5998. "Threshold for async buffer write set to %lld\n",
  5999. __offload_use_async_buffer_write);
  6000. }
  6001. }
  6002. else {
  6003. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
  6004. mic_use_async_buffer_write_envname);
  6005. }
  6006. env_var = getenv(mic_use_async_buffer_read_envname);
  6007. if (env_var != 0 && *env_var != '\0') {
  6008. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6009. mic_use_async_buffer_read_envname, env_var);
  6010. uint64_t new_size;
  6011. if (__offload_parse_size_string(env_var, new_size)) {
  6012. __offload_use_async_buffer_read = new_size;
  6013. OFFLOAD_DEBUG_TRACE(2,
  6014. "Threshold for async buffer read set to %lld\n",
  6015. __offload_use_async_buffer_read);
  6016. }
  6017. }
  6018. else {
  6019. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
  6020. mic_use_async_buffer_read_envname);
  6021. }
  6022. // mic initialization type
  6023. env_var = getenv(offload_init_envname);
  6024. if (env_var != 0 && *env_var != '\0') {
  6025. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6026. offload_init_envname, env_var);
  6027. if (strcmp(env_var, "on_offload") == 0) {
  6028. __offload_init_type = c_init_on_offload;
  6029. OFFLOAD_DEBUG_TRACE(2,
  6030. "A MIC device will be initialized "
  6031. "on first offload to that device\n");
  6032. }
  6033. else if (strcmp(env_var, "on_offload_all") == 0) {
  6034. __offload_init_type = c_init_on_offload_all;
  6035. OFFLOAD_DEBUG_TRACE(2,
  6036. "All MIC devices will be initialized "
  6037. "on first offload to any device\n");
  6038. }
  6039. else if (strcmp(env_var, "on_start") == 0) {
  6040. __offload_init_type = c_init_on_start;
  6041. OFFLOAD_DEBUG_TRACE(2,
  6042. "All MIC devices will be initialized "
  6043. "at program start\n");
  6044. }
  6045. else {
  6046. LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
  6047. }
  6048. }
  6049. else {
  6050. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_init_envname);
  6051. }
  6052. // active wait
  6053. env_var = getenv(offload_active_wait_envname);
  6054. if (env_var != 0 && *env_var != '\0') {
  6055. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6056. offload_active_wait_envname, env_var);
  6057. int64_t new_val;
  6058. if (__offload_parse_int_string(env_var, new_val)) {
  6059. __offload_active_wait = new_val;
  6060. OFFLOAD_DEBUG_TRACE(2,
  6061. "Flag to poll on event completion is set to %d\n",
  6062. __offload_active_wait);
  6063. }
  6064. else {
  6065. LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
  6066. offload_active_wait_envname);
  6067. }
  6068. }
  6069. else {
  6070. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_active_wait_envname);
  6071. }
  6072. // always wait
  6073. env_var = getenv(offload_always_wait_envname);
  6074. if (env_var != 0 && *env_var != '\0') {
  6075. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6076. offload_always_wait_envname, env_var);
  6077. int64_t new_val;
  6078. if (__offload_parse_int_string(env_var, new_val)) {
  6079. __offload_always_wait = new_val;
  6080. OFFLOAD_DEBUG_TRACE(2,
  6081. "Flag to poll on event completion is set to %d\n",
  6082. __offload_active_wait);
  6083. }
  6084. else {
  6085. LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
  6086. offload_always_wait_envname);
  6087. }
  6088. }
  6089. else {
  6090. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_always_wait_envname);
  6091. }
  6092. // omp device num
  6093. env_var = getenv(omp_device_num_envname);
  6094. if (env_var != 0 && *env_var != '\0') {
  6095. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6096. omp_device_num_envname, env_var);
  6097. int64_t new_val;
  6098. if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
  6099. __omp_device_num = new_val;
  6100. OFFLOAD_DEBUG_TRACE(2, "OpenMP default device number is set to %d\n",
  6101. __omp_device_num);
  6102. }
  6103. else {
  6104. LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
  6105. omp_device_num_envname);
  6106. }
  6107. }
  6108. else {
  6109. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", omp_device_num_envname);
  6110. }
  6111. // parallel copy of offload_transfer
  6112. env_var = getenv(parallel_copy_envname);
  6113. if (env_var != 0 && *env_var != '\0') {
  6114. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6115. parallel_copy_envname, env_var);
  6116. int64_t new_val;
  6117. if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
  6118. __offload_parallel_copy = new_val;
  6119. OFFLOAD_DEBUG_TRACE(2,
  6120. "Flag for using async buffer copy is set to %d\n",
  6121. __offload_parallel_copy);
  6122. }
  6123. else {
  6124. LIBOFFLOAD_ERROR(c_invalid_env_var_value,
  6125. parallel_copy_envname);
  6126. }
  6127. }
  6128. else {
  6129. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", parallel_copy_envname);
  6130. }
  6131. // use COI interface for noncontiguous arrays transfer
  6132. env_var = getenv(use_coi_noncontiguous_transfer_envname);
  6133. if (env_var != 0 && *env_var != '\0') {
  6134. OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
  6135. use_coi_noncontiguous_transfer_envname, env_var);
  6136. uint64_t new_size;
  6137. if (__offload_parse_size_string(env_var, new_size)) {
  6138. __offload_use_coi_noncontiguous_transfer = new_size;
  6139. OFFLOAD_DEBUG_TRACE(2,
  6140. "Flag for using new COI noncontiguous API is set to %d\n",
  6141. __offload_use_coi_noncontiguous_transfer);
  6142. }
  6143. else {
  6144. LIBOFFLOAD_ERROR(c_invalid_env_var_value,
  6145. use_coi_noncontiguous_transfer_envname);
  6146. }
  6147. }
  6148. else {
  6149. OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
  6150. use_coi_noncontiguous_transfer_envname);
  6151. }
  6152. OFFLOAD_DEBUG_TRACE(2, "---- End of environment variable processing\n");
  6153. // init ORSL
  6154. ORSL::init();
  6155. }
  6156. extern int __offload_init_library(void)
  6157. {
  6158. // do one time intialization
  6159. static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
  6160. __offload_run_once(&ctrl, __offload_init_library_once);
  6161. // offload is available if COI is available and the number of devices > 0
  6162. bool is_available = COI::is_available && (mic_engines_total > 0);
  6163. // register pending libraries if there are any
  6164. if (is_available && __target_libs) {
  6165. mutex_locker_t locker(__target_libs_lock);
  6166. for (TargetImageList::iterator it = __target_libs_list.begin();
  6167. it != __target_libs_list.end(); it++) {
  6168. // Register library in COI
  6169. COI::ProcessRegisterLibraries(1, &it->data, &it->size,
  6170. &it->origin, &it->offset);
  6171. // add lib to all engines
  6172. for (int i = 0; i < mic_engines_total; i++) {
  6173. mic_engines[i].add_lib(*it);
  6174. }
  6175. }
  6176. __target_libs = false;
  6177. __target_libs_list.clear();
  6178. }
  6179. return is_available;
  6180. }
  6181. extern "C" bool __offload_target_image_is_executable(const void *target_image)
  6182. {
  6183. const struct Image *image = static_cast<const struct Image*>(target_image);
  6184. // decode image
  6185. const char *name = image->data;
  6186. const void *data = image->data + strlen(image->data) + 1;
  6187. // determine image type
  6188. const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
  6189. return (hdr->e_type == ET_EXEC);
  6190. }
  6191. extern "C" bool __offload_register_image(const void *target_image)
  6192. {
  6193. const struct Image *image = static_cast<const struct Image*>(target_image);
  6194. const void *data = image->data + strlen(image->data) + 1;
  6195. uint64_t size = image->size;
  6196. uint64_t offset = 0;
  6197. // decode image
  6198. const char *fat_name = image->data;
  6199. char *mic_name = (char *) malloc(strlen(image->data) + 1);
  6200. char *host_name = (char *) malloc(strlen(image->data));
  6201. int i;
  6202. if ((mic_name == NULL) || (host_name == NULL))
  6203. LIBOFFLOAD_ERROR(c_malloc);
  6204. // The origin name is the name of the file on the host
  6205. // this is used by Vtune, since it is a fat binary we
  6206. // use the host file name of the fat binary.
  6207. // Driver prepends the host file name ending with "?"
  6208. // to the image->data name so need to extract the string
  6209. // name format: <mic_name>?<origin>
  6210. // Get <mic_name>
  6211. i = 0;
  6212. while ((*fat_name != '\0') && (*fat_name != '?')) {
  6213. mic_name[i] = *fat_name;
  6214. fat_name++;
  6215. i++;
  6216. }
  6217. // Remove the host file name by inserting end of string marker
  6218. mic_name[i] = '\0';
  6219. // Get <host_name>
  6220. if (*fat_name == '?') {
  6221. // The string following "?" is the name of the host file name.
  6222. fat_name++;
  6223. i = 0;
  6224. while (*fat_name != '\0') {
  6225. host_name[i] = *fat_name;
  6226. fat_name++;
  6227. i++;
  6228. }
  6229. host_name[i] = '\0';
  6230. }
  6231. else {
  6232. // Windows current does not have host name
  6233. free(host_name);
  6234. host_name = 0;
  6235. }
  6236. // our actions depend on the image type
  6237. const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
  6238. switch (hdr->e_type) {
  6239. case ET_EXEC:
  6240. __current_image_is_dll = false;
  6241. // Each offload application is supposed to have only one target
  6242. // image representing target executable.
  6243. // No thread synchronization is required here as the initialization
  6244. // code is always executed in a single thread.
  6245. if (__target_exe != 0) {
  6246. LIBOFFLOAD_ERROR(c_multiple_target_exes);
  6247. exit(1);
  6248. }
  6249. __target_exe = new TargetImage(mic_name, data, size, host_name, offset);
  6250. // Registration code for execs is always called from the context
  6251. // of main and thus we can safely call any function here,
  6252. // including LoadLibrary API on windows. This is the place where
  6253. // we do the offload library initialization.
  6254. if (__offload_init_library()) {
  6255. // initialize engine if init_type is on_start
  6256. if (__offload_init_type == c_init_on_start) {
  6257. for (int i = 0; i < mic_engines_total; i++) {
  6258. mic_engines[i].init();
  6259. }
  6260. }
  6261. }
  6262. return mic_engines_total > 0;
  6263. case ET_DYN:
  6264. {
  6265. char * fullname = NULL;
  6266. __current_image_is_dll = true;
  6267. // We add the library to a list of pending libraries
  6268. __target_libs_lock.lock();
  6269. __target_libs = true;
  6270. __target_libs_list.push_back(
  6271. TargetImage(mic_name, data, size, fullname, offset));
  6272. __target_libs_lock.unlock();
  6273. // If __target_exe is set, then main has started running
  6274. // If not main, then we can't do anything useful here
  6275. // because this registration code is called from DllMain
  6276. // context (on windows).
  6277. if (__target_exe != 0) {
  6278. // There is no need to delay loading the library
  6279. if (!__offload_init_library()) {
  6280. // Couldn't validate library as a fat offload library
  6281. LIBOFFLOAD_ERROR(c_unknown_binary_type);
  6282. exit(1);
  6283. }
  6284. }
  6285. return true;
  6286. }
  6287. default:
  6288. // something is definitely wrong, issue an error and exit
  6289. LIBOFFLOAD_ERROR(c_unknown_binary_type);
  6290. exit(1);
  6291. }
  6292. }
// When dlopen is used, dlclose may happen after the COI process
// is destroyed.  In that case images cannot be unloaded and should
// be skipped — so track whether COI may already have been unloaded.
static bool coi_may_have_been_unloaded = false;
// Unregisters a previously registered target image.  For the executable
// image this performs final library cleanup; for a shared library it is
// unloaded from every engine (unless COI may already be gone).
extern "C" void __offload_unregister_image(const void *target_image)
{
    // Target image is packed as follows:
    //      8 bytes                - size of the target binary
    //      null-terminated string - binary name
    //      <size> bytes           - binary contents
    const struct Image {
         int64_t size;
         char data[];
    } *image = static_cast<const struct Image*>(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    if (hdr->e_type == ET_EXEC) {
        // We are executing exec's destructors.
        // It is time to do a library cleanup.
        if (timer_enabled) {
            Offload_Timer_Print();
        }

        // From this point on, later (dlclose-driven) unregistrations must
        // not touch COI — it may be torn down by the cleanup below.
        coi_may_have_been_unloaded = true;

        // Do not unload the MYO library if it was loaded in a DLL.
        if (!__myo_init_in_so)
        {
            #ifdef MYO_SUPPORT
            __offload_myoFini();
            #endif // MYO_SUPPORT

            __offload_fini_library();
        }
    }
    else if ((hdr->e_type == ET_DYN) && !coi_may_have_been_unloaded) {
        // shared library: unload it from every engine
        for (int i = 0; i < mic_engines_total; i++) {
            mic_engines[i].unload_library(data, name);
        }
    }
}
// Registers a user-supplied callback invoked on offload task completion.
extern "C" void __offload_register_task_callback(void (*cb)(void *))
{
    task_completion_callback = cb;
}
// Runtime trace interface for user programs

// Sets the console trace level at runtime.
void __offload_console_trace(int level)
{
    console_enabled = level;
}
// User-visible offload API

// Returns the number of MIC engines available for offloading,
// initializing the library on first use (0 when offload is unavailable).
int _Offload_number_of_devices(void)
{
    __offload_init_library();
    return mic_engines_total;
}
// Host-side implementation: code running on the host has no current
// device, so this always returns -1.
int _Offload_get_device_number(void)
{
    return -1;
}
// Host-side implementation: code running on the host has no physical
// device number, so this always returns -1.
int _Offload_get_physical_device_number(void)
{
    return -1;
}
  6357. int _Offload_signaled(int index, void *signal)
  6358. {
  6359. __offload_init_library();
  6360. // check index value
  6361. if (index < 0) {
  6362. LIBOFFLOAD_ERROR(c_offload_signaled1, index);
  6363. LIBOFFLOAD_ABORT;
  6364. }
  6365. index %= mic_engines_total;
  6366. // find associated async task
  6367. OffloadDescriptor *task =
  6368. mic_engines[index].find_signal(signal, false);
  6369. if (task == 0) {
  6370. LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
  6371. LIBOFFLOAD_ABORT;
  6372. }
  6373. // if signal is removed by wait completing
  6374. else if (task == SIGNAL_HAS_COMPLETED) {
  6375. return (true);
  6376. }
  6377. return task->is_signaled();
  6378. }
  6379. void _Offload_report(int val)
  6380. {
  6381. if (val == OFFLOAD_REPORT_ON ||
  6382. val == OFFLOAD_REPORT_OFF) {
  6383. offload_report_enabled = val;
  6384. }
  6385. }
// Looks up the MIC buffer associated with a host address on the given
// target device and fills in its geometry through the out-parameters.
// Returns 0 if no association exists (or the sink address cannot be
// obtained); otherwise 1 for static data, or the association's reference
// count for dynamically created associations.
int _Offload_find_associated_mic_memory(
    int     target,
    const void*  cpu_addr,
    void**  cpu_base_addr,
    uint64_t* buf_length,
    void**  mic_addr,
    uint64_t* mic_buf_start_offset,
    int*    is_static
)
{
    __offload_init_library();

    // check target value
    if (target < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, target);
        LIBOFFLOAD_ABORT;
    }
    // NOTE(review): assumes mic_engines_total > 0 here (modulo by zero
    // otherwise) — presumably guaranteed by a successful init; confirm.
    target %= mic_engines_total;

    // find existing association in pointer table
    PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
    if (ptr_data == 0) {
        OFFLOAD_TRACE(3, "Association does not exist\n");
        return 0;
    }

    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                  ptr_data->is_static);

    // lazily resolve the device-side (sink) address of the buffer the
    // first time it is requested
    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                                  &ptr_data->mic_addr);
        if (res != COI_SUCCESS) {
            return 0;
        }
    }

    // fill in the out-parameters describing the association
    *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
    *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
    *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
    *mic_buf_start_offset = ptr_data->alloc_disp;
    *is_static = ptr_data->is_static;
    return ptr_data->is_static ? 1 : ptr_data->get_reference();
}
  6426. _Offload_stream _Offload_stream_create(
  6427. int device, // MIC device number
  6428. int number_of_cpus // Cores allocated to the stream
  6429. )
  6430. {
  6431. __offload_init_library();
  6432. // check target value
  6433. if (device < 0) {
  6434. LIBOFFLOAD_ERROR(c_offload_signaled1, device);
  6435. LIBOFFLOAD_ABORT;
  6436. }
  6437. device %= mic_engines_total;
  6438. // Create new stream and get its handle
  6439. _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
  6440. if (handle == 0) {
  6441. OFFLOAD_TRACE(3, "Can't create stream\n");
  6442. return 0;
  6443. }
  6444. // create pipeline associated with the new stream
  6445. mic_engines[device].get_pipeline(handle);
  6446. return(handle);
  6447. }
  6448. int _Offload_stream_destroy(
  6449. int device, // MIC device number
  6450. _Offload_stream handle // stream to destroy
  6451. )
  6452. {
  6453. if (Stream::get_streams_count() == 0) {
  6454. LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
  6455. LIBOFFLOAD_ABORT;
  6456. }
  6457. // check target value
  6458. if (device < 0) {
  6459. LIBOFFLOAD_ERROR(c_offload_signaled1, device);
  6460. LIBOFFLOAD_ABORT;
  6461. }
  6462. device %= mic_engines_total;
  6463. mic_engines[device].stream_destroy(handle);
  6464. return(true);
  6465. }
  6466. int _Offload_stream_delete(
  6467. _Offload_stream handle // stream to destroy
  6468. )
  6469. {
  6470. int device; // MIC device number
  6471. Stream * stream;
  6472. if (Stream::get_streams_count() == 0) {
  6473. LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
  6474. LIBOFFLOAD_ABORT;
  6475. }
  6476. stream = Stream::find_stream(handle, false);
  6477. // the stream was not created or was destroyed
  6478. if (!stream) {
  6479. LIBOFFLOAD_ERROR(c_offload_no_stream, device);
  6480. LIBOFFLOAD_ABORT;
  6481. }
  6482. device = stream->get_device();
  6483. mic_engines[device].stream_destroy(handle);
  6484. return(true);
  6485. }
// Check completion of offloads in one stream (handler != 0) or in all
// streams on a device (handler == 0; device == -1 means every device).
// Returns true when every examined stream's last offload has signaled,
// or when no pending offload remains; aborts on invalid arguments.
int _Offload_stream_completed(int device, _Offload_stream handler)
{
    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        LIBOFFLOAD_ABORT;
    }
    // check device index value
    if (device < -1) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, device);
        LIBOFFLOAD_ABORT;
    }
    else if (device > -1) {
        // wrap the index onto the set of available engines
        device %= mic_engines_total;
    }
    // get stream
    Stream * stream;

    if (handler != 0) {
        stream = Stream::find_stream(handler, false);
        // the stream was not created or was destroyed
        if (!stream) {
            LIBOFFLOAD_ERROR(c_offload_no_stream, device);
            LIBOFFLOAD_ABORT;
        }
        // the handle must belong to the device the caller named
        if (device != stream->get_device()) {
            LIBOFFLOAD_ERROR(c_offload_device_doesnt_match_to_stream,
                             stream->get_device());
            LIBOFFLOAD_ABORT;
        }
        // find associated async task
        OffloadDescriptor *task = stream->get_last_offload();
        // offload was completed by offload_wait pragma or wait clause
        if (task == 0) {
            return(true);
        }
        return task->is_signaled();
    }
    // zero handler is for all streams at the device
    else {
        // NOTE: iterates over a copy of the global stream map
        StreamMap stream_map = Stream::all_streams;
        for (StreamMap::iterator it = stream_map.begin();
             it != stream_map.end(); it++) {
            Stream * stream = it->second;
            if (device != -1 && device != stream->get_device()) {
                continue;
            }
            // find associated async task
            OffloadDescriptor *task = stream->get_last_offload();
            // offload was completed by offload_wait pragma or wait clause
            if (task == 0) {
                continue;
            }
            // if even one stream is not completed result is false
            if (!task->is_signaled()) {
                return false;
            }
        }
        // no uncompleted streams
        return true;
    }
}
  6546. int _Offload_stream_is_empty(_Offload_stream handle)
  6547. {
  6548. int device;
  6549. if (Stream::get_streams_count() == 0) {
  6550. LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
  6551. LIBOFFLOAD_ABORT;
  6552. }
  6553. if (handle != 0) {
  6554. Stream * stream = Stream::find_stream(handle, false);
  6555. // the stream was not created or was destroyed
  6556. if (!stream) {
  6557. LIBOFFLOAD_ERROR(c_offload_no_stream, device);
  6558. LIBOFFLOAD_ABORT;
  6559. }
  6560. device = stream->get_device();
  6561. }
  6562. else {
  6563. device = -1;
  6564. }
  6565. // Use 0 for device index as _Offload_stream_completed
  6566. // ignores this value while defining streams completion
  6567. return _Offload_stream_completed(device, handle);
  6568. }
  6569. int _Offload_device_streams_completed(int device)
  6570. {
  6571. if (Stream::get_streams_count() == 0) {
  6572. LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
  6573. LIBOFFLOAD_ABORT;
  6574. }
  6575. // check index value
  6576. if (device < -1) {
  6577. LIBOFFLOAD_ERROR(c_offload_signaled1, device);
  6578. LIBOFFLOAD_ABORT;
  6579. }
  6580. else if (device > -1) {
  6581. device %= mic_engines_total;
  6582. }
  6583. StreamMap stream_map = Stream::all_streams;
  6584. for (StreamMap::iterator it = stream_map.begin();
  6585. it != stream_map.end(); it++)
  6586. {
  6587. Stream * stream = it->second;
  6588. if (device != -1 && device != stream->get_device()) {
  6589. continue;
  6590. }
  6591. // find associated async task
  6592. OffloadDescriptor *task = stream->get_last_offload();
  6593. // offload was completed by offload_wait pragma or wait clause
  6594. if (task == 0) {
  6595. continue;
  6596. }
  6597. // if even one stream is not completed result is false
  6598. if (!task->is_signaled()) {
  6599. return false;
  6600. }
  6601. }
  6602. // no uncompleted streams
  6603. return true;
  6604. }
// IDB support
// Interface variables for the IDB debugger.  NOTE(review): these are
// presumably read/written by the debugger itself (they are only defined,
// never referenced, in the visible code) -- confirm against the debugger
// protocol before changing them.
int __dbg_is_attached = 0;          // 0 until a debugger attaches (presumably)
int __dbg_target_id = -1;           // -1 = no target selected
pid_t __dbg_target_so_pid = -1;     // -1 = target-side pid unknown
char __dbg_target_exe_name[MAX_TARGET_NAME] = {0}; // empty until filled in
const int __dbg_api_major_version = 1; // debugger interface version, major
const int __dbg_api_minor_version = 0; // debugger interface version, minor
// Intentionally empty.  NOTE(review): presumably a hook the debugger
// sets a breakpoint on when the target shared object is loaded -- do
// not remove or inline; confirm against the debugger interface.
void __dbg_target_so_loaded()
{
}
// Intentionally empty.  NOTE(review): presumably the unload counterpart
// of __dbg_target_so_loaded, used as a debugger breakpoint anchor -- do
// not remove or inline; confirm against the debugger interface.
void __dbg_target_so_unloaded()
{
}