Top Banner
!"#$"#!% !"#$%$&’ )#%$*+,&- !"#$%$&’ .$/’"% 0*&$ )#%$*+,&- 1234)5 634)5 734)8 4,/%"*%/#,’$/’9%$: 3&’$%;$+,*’$ <$’=$$& 2326 *&+ 4346 2/*>*% *&+ 2346 ?*%*>>$> @$/’"%
90

Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

Feb 22, 2023

Download

Documents

Khang Minh
Welcome message from author
This document is posted to help you gain knowledge. Please leave a comment to let me know what you think about it! Share it to your friends and learn new things together.
Transcript
Page 1: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '

!"#$%$&'()#%$*+,&-

!"#$%$&'(.$/'"%(0*&$()#%$*+,&-1234)5(634)5(734)8

Microarchitectures Intermediate Between SISD and MIMD

Scalar and SIMD parallel vector

Page 2: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $

Disclaimer

The content of this presentation is my personal opinion only.

The statements I make here in no way represent my employer's position, nor am I authorized to speak on behalf of my employer.

This is not a product announcement, nor an announcement of product details. Nor even an announcement of a research direction

Although I am an employee - currently of Intel, in the past of other computer companies such as AMD, Motorola, and Gould - I reveal this only so that the reader may account for any possible bias I may have towards my employer's products.

In fact, this posting may not even represent my personal opinion, since occasionally I play devil's advocate.

Page 3: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (

)$%;,&">"-B

! ),;$(?,?$>,&$+(@$/'"%:(K($C-C(!%*BL

! 2346(?*%*>>$>(@$/'"%:(K($C-C(!"&@$M5($C-C(6N!()*%*&'9>*5($C-C(0O<

! )#%$*+(K(:'*&+*%+(:"A'=*%$('$%;(K(P!(Q(2P(Q(%$-:C((49>',?>$('#%$*+:(,&(:*;$(@,%'9*>(*++%$::(:?*/$C

! 3&'%*I>*&$("%(!%"::(0*&$(0*'$&/B()">$%*&'5()#%$*+:

" !";;"&>B(/*>>$+(R,&$(J%*,&(49>','#%$*+,&-S(34TU(*(#"%%,G>B(;,:>$*+,&-('$%;5($C-C(<9%'"&(2;,'#(TNP('#%$*+:

" $C-C(24)5((V;'(W(7#;(T)5(0O<('#%$*+:5

" 2"N4)5(3&'$%>$*@$+(4)5(',;$:>,/$+('#%$*+:

! .$/'"%(0*&$()#%$*+,&-(K(J>$=,:;CCC(!"#$%$&'(.$/'"%(0*&$()#%$*+,&-(CCCC(!"#$%$&'()#%$*+,&-CCC

(K(X"FB%*D,:(1<$%D$>$B8(Y

! Z&A"%'9&*'$>B5('#$,%(.0)(+,AA$%$&'('#*&(J>$=(9:$("A(.0)

" 234)(K(2,&->$(3&:'%9/',"&(49>',?>$()#%$*+:(K(7@,+,*('$%;,&">"-B

" 634)(K(69*>(3&:'%9/',"&(49>',?>$()#%$*+:(K($C-C(3&'$>(J$&Y

" 734)(K(7(,&:'%9/',"&:(4('#%$*+:(K(J>$=,:;

!

! !"#$%$&'(P*%*>>$>()#%$*+(J%"9?,&-

" 7@,+,*([=*%?\(1"A('#%$*+:8

" 3&'$>(J$&](['#%$*+\(1"A(/#*&&$>:8

" H)3([=*@$A%"&'\

! .$/'"%(>*&$('#%$*+

" 7@,+,*(['#%$*+\

" 3&'$>(J$&]([/#*&&$>\

! 0*&$(^(*>,-&$+($>$;$&'("A(2346(@$/'"%(+*'*?*'#

" [P#B:,/*>(0*&$\(I(*:(*G"@$5(*/'9*>>B(,;?>$;$&'$+

" [.,%'9*>(0*&$\(I(&$=(J>$=,:;5(:'9AA(*::"/,*'$+(=,'#(*>,-&$+($>$;$&':("A(*(2346(@$/'"%(+*'*?*'#5(?"::,G>B(',;$(;9>',?>$M$+("@$%(;9>',?>$(/>"/D(/B/>$:

! $C-C(H:*&"@,/E:(.,%'9*>(P%"/$::"%:

! !"#$%$&/B(^(JPZ('$%;5(:,;,>*%,'B(G$'=$$&('#%$*+:C7U)(/*/#$("%(;$;"%B(/"&:,:'$&/BW/"#$%$&/BC

)$%;,&">"-B(,&('#,:(?%$:$&'*',"&(7U)(/"&:,:'$&'5(%$A>$/':($@">9',"&W*//9;9>*',"&C((

Page 4: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )

2346(.$/'"%:

)#%$*+W!"%$WP%"/$::"%(P*%*>>$>,:;

L()L(/"%$

L___:

L__:

L_:

2/*>*% `aG LbcGdLbGL`M2P

O$*>>B(V,+$

0O</#,?

0O</"%$

7T;

7.

7.2P

P` P33

3&'$>J$&

H)3

H)32P

Pa

4%;!"%$

234)(2/*>*%

234).03V

634).$/

H>?#*)*%*&'9>*

4346

H>?#*TP!

!"#$%$&'(.$/'"%(0*&$()#%$*+,&-

3:"(TV(/":'

3:"(P"=$%

((((((((((((3:"(!";?>$M,'B

!4IL3>>,*/3.

Page 5: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *

JPZ(2#*+$%(N&-,&$:*%$(7U)(

e"9%(R*;,>BE:(.$/'"%(P%"/$::"%

7$,'#$%(Z&/>$(2$B;"9%(1!%*B8.$/'"%(),;$(P,?$>,&$+

7"%(Z&/>$(2'$@$(1V*>>*/#8.$/'"%(P*%*>>$>

H&+B(J>$=

Page 6: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +

"%CCC

T"=('"(9:$(JPZI>,D$!"#$%$&'(.$/'"%(0*&$()#%$*+,&-1*D*(234)5(*D*(634)5(*D*(734)8'"(,&/%$*:$(@$/'"%(H0Z($AA,/,$&/B

"@$%(2346(@$/'"%(?*%*>>$>;,/%"*%/#,'$/'9%$:

Page 7: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,

H-$&+*! !"#$%&'(#)%"

" *+,-.%)"#-%/-#+)0-.$,0,"#1#)%"

" 2(131$4-5!564-2!56

" 575*-89-%:#);)<1#)%"0-=-2!5*>

" 2!5*-8(3100)(-0#1(?>4-6!5*4-@!5*4-

7%+,$,"#-A,(#%$-B1",-*+$,1&)"C

! !",//)(),"(,0-)"-2!56-:1$133,3-D,(#%$-:$%(,00%$0

" AB-E-A;10?-E-B1",-($%00)"C

" *);,-:):,3)",&-D,(#%$0-F-AB-F-2!5*-)/,#(+

" G'"&1;,"#13-2!5*-1&D1"#1C,-H-:$,&)(1#)%"

" B%%:-I'//,$J-2!56- -2!5*- -5!56! !

" K,L)CC,$)"C-*+$,1&0-)"-M1$:

" 5%$,-$,I131"()"C

! 7%"(3'0)%"0

" 2';;1$N-%/-2!5*-A.O-'#)3)<1#)%"-);:$%D,;,"#0

" *%:-PQ-$,10%"0-R+N-2!5*-7AB*-I,##,$-#+1"-2!56

! S1(?':

" !"#$%&'(#)"'%" *+,-.#/0/!" 1,/2#"(#1,/!333

" 2&4&#5&44%'(#5"6%'

" 1,/!#'%7894%'#:8$%9! 15%%;.#98<%! *&'8&=$%#>$'%79! -8'%9! ?%@877%'8(7

" A'%#1,/!#4B'%&;9#$&4%(CD#4"$%'&(4#4B'%&;9EA'%#1,/!#4B'%&;9#'%&$$D#4B'%&;9#&4#&$$E

" FGH#"(#1,/!" 1,/!#,1A#%I4%(98"(9E

" ?%$8&=8$84D.#+"CJ94%5

" ,(94'KC48"(#&(;#2&4&#0"B%'%(C%,(4'&#L93#,(4'&#0&CB%$8(%#!B'%&;8(7

" -BD#1,/!#89#:'8%(;$8%'#4"#9":46&'%

" 14'&6)&(#/8C'"&'CB84%C4K'%

" !%')8("$"7D

Page 8: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "

4"+$%&(JPZ:(9:$(;*&B(Z*%/#()$/#&,f9$:

! 4*&B([?%"/$::"%(/"%$:\

! <*%%$>(?%"/$::,&-

" <9%'"&(2;,'#(TNP(:'B>$('#%$*+:(^(#,+$(>*'$&/,$:5(*@",+(,&'$%>"/D:

! 2"N4)('#%$*+,&-

! !"#$%&'()*"'+,&")-.*/'0.$,.*'$,"'#%&"1

" )2)'34567'345+7'888

" 9:';"&'<"#$%&'=)*"

! !>9:'$%,&-(=,'#,&('#$([@$/'"%\(=,+'#

" $C-5(aMgb(K(LbcG(/>9:'$%:5(A"%(gb(=,+$(2346

! .$/'"%I',;$I?,?$>,&,&-

! .$/'"%I=,+'#I?*%*>>$>,F*',"&

! H)3h(.03V(=,'#,&(*(@$/'"%(>*&$

Page 9: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & %

The Point of this Presentation • Typical GPUs use

Coherent Vector Lane Threading, aka SIMT, aka SIMD

• Many people seem to believe this equivalent to "conventional" vector processors

• NOT your Father's Vector Processor

– SIMT is not just vectors with mask and good scatter/gather

– Reduces key sources of inefficiency in Parallel Vector Processors

• Coherent Vector Lane Threading is to Parallel Vector Processors what Out-of-Order Execution is to VLIW

– Microarchitecture, not Macroarchitecture

– Same "ISA Concept" runs on scalar MIMD

– Assuming you / encouraging you to have lots of threads

Page 10: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '!

j9,/D(NM*;?>$:("A(NAA$/':"A(234)W634)W734)

!"#$%$&'(.$/'"%(0*&$()#%$*+,&-U?',;,F*',"&:("&(.PZ(Z',>,F*',"&

k9:'(>""D(*'('#$(?,/'9%$:CV$E>>(:$$('#$;(*-*,&(>*'$%5(=,'#($M?>*&*',"&C

Page 11: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ''

P%$+,/*',"&($M$/9'$:(*>>(?*'#:S(234)("&>B($M$/9'$:(,&:'%9/',"&:(

9:$+(*'(>$*:'("&/$

:=,'/#1A1,88(l/*:$(_h(3_S(G%$*DS/*:$(Lh(3LS(G%$*DSm/*:$(nh(3nS(G%$*DSo

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

_

b

d

n

V*:'$+

V*:'$+

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

L

b

d

n

0$::(V*:'$

0$::(V*:'$

Page 12: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '$

V#B(&"'(*(>""?(G9AA$%(*'($*/#(>*&$Y

0*&$(_)#%$*+(_

3_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

3_A"%(*>>('#%$*+:(3(((%$?$*'(L__((((((A"%(*>>(@$/'"%($>$;$&':(i(((((((((3Lh(,A(/"&+p,5iq('#$&((((((((((((3bh(@piq(K(CCC3g

V#$%$(/"&+p,5iq(K((((*%G,'%*%B5($C-CL_LLL_____LL((((

H::9;,&-(/"&+p,5iq(,:(7U)(*@*,>*G>$('",&:'%9/',"&(:$f9$&/$%

3LCL3bCL3LCb3bCb3LCg3bCg3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg3bCg3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg

3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg3LCa

3g

3_3LCL3LCb3bCb3LCg3LCa3bCa

3g

3_3LCL3bCL3LCb3LCg

3LCa3bCa3g

3bCg3bCg

V*,'(9?3A$'/#$:(:*@$+

Page 13: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '(

O$i,--$%,&-()#%$*+:(G$'=$$&(!B/>$:("A(*(;9>',I/B/>$(V*%?

3L%$?$*'(,K_('"(CCC((((A"%('#%$*+:('K_('"(n

(,A($@$&1,8(]UO($@$&(1'8((('#$&(3b(((($>:$(3g

3a

0*&$(_)#%$*+(_

'_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

'L 'b 'g

U%,-,&*>5(3;G*>*&/$+(K(d_r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

,b,bU'#$%(V*%?:(CCC,g,gU'#$%(V*%?:(CCC,b,bU'#$%(V*%?:(CCC,g,g

'L 'g'd 'n

'_ 'b'a '`

'L 'g'd 'n

'_ 'b'a '`

'_ 'L 'b 'g

O$i,--$%$+(=,'#,&(V*@$A%"&'(K(L__r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

'L 'g'd 'n'_ 'b'a '`,g

,b

'L 'g'd 'n'_ 'b'a '`,b

,g

'L 'g'd 'n'_ 'b'a '`,b

,g

NM*;?>$(*::9;$:)#%$*+(:D$=,&-(G$'=$$&(=*@$:(,&(*(=*@$A%"&'

Page 14: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ')

234)(+,:?*'/#(+9%,&-,+>$(,A$'/#(=#,>$

),;$(P,?$>,&$+(.$/'"%:*%$(G9:B

3LA"%(*>>(,

,A("++((('#$&(3b((($>:$(3g$&+,A

$&+(>""?3a

0*&$(_)#%$*+(_

3g

3a

3L

0*&$()#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+

V*:'$+

V*:'$+

V*:'$+

0*&$(_)#%$*+(_

3g

3a

3L

0*&$(L)#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+ V*:'$+

V*:'$+ V*:'$+

0*&$(_)#%$*+(_

3g

3a

3L

0*&$(L)#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+ V*:'$+

V*:'$+ V*:'$+

SIMD: 4/8 = 16/32 = 50% idle inner loop

SIMT+: 4/20 = 20% idle

37.5% speedup

Page 15: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '*

N&+("A(NM*;?>$:C

<*/D('"(NM?>*&*',"&:C

V$E>>(:$$('#$(?,/'9%$:(>*'$%C

Page 16: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '+

?/

,s

+

:

M

?/

%A

M

?/

%A

?/ ?/ ?/

s

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

CCC

?/

,s

+

:

M

?/

%A

M

?/

%A

?/ ?/ ?/

s

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

CCC

?/

,s

+

:

M

?/

%A

M

?/

%A

?/ ?/ ?/

s

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

CCC

?/

,s

+

:

M

?/

%A

M

?/

%A

?/ ?/ ?/

s

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

M

%A

CCC

4$;"%B(:#*%$+(G$'=$$&(;9>',?>$(.0)(?%"/$::"%:

49>',?>$(.0)(?%"/$::"%:

4"+$%&(JPZ:((9:$(;*&B(Z*%/#('$/#&,f9$:

!""#$%&'#!"#()*+,!#+&#"&)#(-%.+&/

Page 17: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ',

4"+$%&(JPZ:(9:$(;*&B(Z*%/#('$/#&,f9$:

! R,M$+(R9&/',"&(0"-,/

! 4*&B(?%"/$::"%(/"%$:

! 49>',?>$('#%$*+:(A"%(>*'$&/B('">$%*&/$

! 29?$%?,?$>,&,&-

! 2346(@$/'"%(?*%*>>$>,:;

! .$/'"%(>*&$('#%$*+,&-(1*D*(234)5(634)5(734)8

! !>9:'$%:("A($M$/9',"&(9&,':

! .*%,*G>$(t(%$-,:'$%:(?$%('#%$*+(1'"(:9??"%'(;"%$('#%$*+:(,&(:*;$(A,M$+(:,F$(%$-,:'$%(A,>$8

! 2/*''$%(-*'#$%(*&+(:'%,+$+(*//$::(,&('#$(%$-,:'$%(A,>$

! 2/*''$%(-*'#$%(*&+(:'%,+$+(*//$::('"(;$;"%B

! !*/#$:(m("%(&"'

!""#$%&'#!"#()*+,!#+&#"&)#01+()

Page 18: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '"

4"+$%&(JPZ:(9:$(;*&B(Z*%/#('$/#&,f9$:

2"(>$'E:(>""D(*'(:";$(9*%/#('$/#&,f9$:,&(,:">*',"&CCC

Page 19: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & '%

,&'(*>9(1`aG8

A?(*>9(1`aG8

?%"-%*;(/"9&'$%

,&:'%9/',"&(:'"%$(W(3s

,&'(+$/"+$

%$*+B(W(:/#$+9>$

%$-,:'$%(A,>$

+*'*(;$;"%B(W(/*/#$

2/*>*%(P%"/$::"%

Page 20: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $!

M

?/

,s

+

:

%A

+s

M

?/

,s

+

:

%A

+s

M

?/

,s

+

:

%A

+s

M

?/

,s

+

:

%A

+s

M

?/

,s

+

:

%A

+s

M

?/

,s

+

:

%A

+s

2#*%$+(4$;"%B

4346(P*%*>>$>(P%"/$::"%

Page 21: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $'

@?9

?/

,s

+

:

+s

M M M M M

4

.$/'"%WP*%*>>$>([2346\(4,/%"?%"/$::"%

/%"::(@$/'"%(>*&$("?$%*',"&:5($'/C

; ; ; ; ; ;

CCCM

%A

@%A((1$C-C(LbcG(=,+$(A"%(22N8

Page 22: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $$

?/

,s

+

M

:

%A

0_(+s

4"%$(s(T,$%*%/#B(*&+(4$;"%B

49>',/>9:'$%(49>','#%$*+,&-

M

:

%A

0_(+s

2#*%$+(0L(+s

RPY4!4)(%$?>,/*'$:('#$(10_8(6s5('"(%$+9/$('#%*:#,&-(G$'=$$&('#%$*+:

)#$(%$:'("A('#$(/%,',/*>(>""?(I(:/#$+9>$%5(H0Z:5(?"::,G>B(OR(^(,:(%$?>,/*'$+('"(*@",+((,&/%$*:,&-(/%,',/*>(>""?(

>*'$&/B

3s(:#*%$+(G$'=$$&('#%$*+:h*8(>*'$&/B('">$%*&'5G8('%9$(:#*%,&-

4!4)("&>B(+,:?*'/#$:(*&(,&:'%9/',"&('"(*(

:,&->$(/>9:'$%(W('#%$*+

<9'(,A(G"'#('#%$*+:(*%$($M$/9',&-('#$(:*;$(,&:'%9/',"&5

=#B(&"'(:$&+('"(G"'#Y372)OZ!)3U7!UTNON7!N

2,;,>*%>B5(,A(G"'#('#%$*+:(*%$(+IA$'/#,&-(:*;$(/*/#$(>,&$5(=#B(

&"'(/";G,&$Y6H)H(!UTNON7!N

<$A"%$('*>D,&-(*G"9'(234)5(*&($M*;?>$(/>":$%('"(<,-(!"%$(;*,&:'%$*;

Page 23: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $(

U&$(V*B("A(0""D,&-(*'(3'

.$/'"%(0*&$()#%$*+,&-1234)5(634)5(734)8

K49>',/>9:'$%(49>','#%$*+,&-

Q"?',;,F*',"&:('"('*D$(*+@*&'*-$

"A(3&:'%9/',"&(!"#$%$&/$*&+(6*'*(!"#$%$&/$

Page 24: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $)

@?9

?/

,s

+

:

+s

M M M M M

4

.$/'"%WP*%*>>$>([2346\(1A"%(f9,/D(/";?*%,:"&8

/%"::(@$/'"%(>*&$("?$%*',"&:5($'/C

; ; ; ; ; ;

CCCM

%A

@%A((1$C-C(LbcG(=,+$(A"%(22N8

Page 25: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $*

?/

,s

+

:

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

4$;"%B(W(s(:#*%$+(G$'=$$&(.$/'"%(0*&$([P%"/$::"%:\

.$/'"%(0*&$()#%$*+,&-(1*D*(234)8

2$>$/'(L(P!(?$%(/B/>$

R$'/#(L(,&:'%9/',"&(?$%(/B/>$

2";$(>*&$:($M$/9'$(,&:'%9/',"&

2";$(+"&E'

Page 26: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $+

!>*::,/(234)(2'*/D(H>-"%,'#;

?/,s+:

M

?/

%A

+s

M

?/

%A

M

?/

%A

M

?/

%A

4

• Initially, all PCs in a warp are equal

• Execute PCs under mask

– If diverging (IF)

• Push target PC, mask

– If converging (ENDIF, ENDLOOP, RETURN)

• If matches TOS

• Pop TOS.(PC,mask)

– If switching (ELSE)

• If matches TOS

• Pop TOS.(PC,mask)

• Worst case:

– 1 stack entry per vector lane * number of conventional horizontal threads

– But only one PC and one TOS entry need be examined at a time.

– No CAMs.

?/ ;*:D?/ ;*:D?/ ;*:D?/ ;*:D

:'*/D

H%/#,'$/'9%*>('#%$*+(P!:

<$''$%(*>-"%,'#;:($M,:'C(2$$5($C-C(R9&-

Page 27: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $,

?/

,s

+

:

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

4$;"%B(W(s(:#*%$+(G$'=$$&(.$/'"%(0*&$([P%"/$::"%:\

.$/'"%(0*&$()#%$*+,&-(I(b(1*D*(634)8

R$'/#(b(+,AA$%$&'(,&:'%9/',"&:(?$%(/B/>$

2";$(H0Z:

$M$/9'$(L:'(,&:'%9/',"&

2";$(H0Z:

$M$/9'$(b&+((,&:'%9/',"&

2";$(H0Z:

$M$/9'$(&"(,&:'%9/',"&

Page 28: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $"

?/

,s

+

:

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

4$;"%B(W(s(:#*%$+(G$'=$$&(.$/'"%(0*&$([P%"/$::"%:\

.$/'"%(0*&$()#%$*+,&-(^(7734)(K(7(,&:'%9/',"&:5(4('#%$*+:5(7(v(4

Page 29: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & $%

2'*'$("A('#$(H%'

! 7@,+,*

" J'bc_h(234)12/*>*%85(:'%,/'>B(:/*>*%(=,'#,&(>*&$:

" J'g__h(YY(4P465([4346I,:#\(YY

! H)3(nn__

" 234)1.03V8h(c(>*&$:5(>*&$:(K(d(=,+$(.03V

! 0O<

" dLb(G,'(1`a<5(L`MgbG8(2346

Page 30: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (!

!";?*%,&-(2346(@$/'"%(?*%*>>$>'"(234)(@$/'"%(>*&$('#%$*+$+

@?9

?/,s+:

+s

M M M M M

4

/%"::(@$/'"%(>*&$("?$%*',"&:5($'/C

; ; ; ; ; ;CCCM

%A@%A((1$C-C(LbcG(=,+$(A"%(22N8

?/,s+:

M

?/

%A

+s

M

?/

%A

M

?/

%A

M

?/

%A

M

?/

%A

4

Page 31: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ('

)#$(P",&'("A('#,:(P%$:$&'*',"&! )B?,/*>(JPZ:(9:$

.$/'"%(0*&$()#%$*+,&-5(*D*(234)5(*D*(2346

! 4*&B(?$"?>$(:$$;('"(G$>,$@$('#,:($f9,@*>$&'('"([/"&@$&',"&*>\(@$/'"%(?%"/$::"%:

! 7U)(B"9%(R*'#$%E:(.$/'"%(P%"/$::"%

" 234)(,:(&"'(i9:'(@$/'"%:(=,'#(;*:D(*&+(-""+(:/*''$%W-*'#$%

" O$+9/$:(D$B(:"9%/$:("A(,&$AA,/,$&/B(,&(P*%*>>$>(.$/'"%(P%"/$::"%:

! .$/'"%(0*&$()#%$*+,&-(,:('"(P*%*>>$>(.$/'"%(P%"/$::"%:=#*'(U9'I"AIU%+$%(NM$/9',"&(,:('"(.03V

" 4,/%"*%/#,'$/'9%$5(&"'(4*/%"*%/#,'$/'9%$

" 2*;$([32H(!"&/$?'\(%9&:("&(:/*>*%(4346

" H::9;,&-(B"9(W$&/"9%*-,&-(B"9('"(#*@$(>"':("A('#%$*+:

Page 32: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ($

3&$AA,/,$&/,$:(,&(P*%*>>$>(.$/'"%(P%"/$::"%:

! .$/'"%(0$&-'#

! !"&+,',"&*>:(W(.$/'"%(4*:D,&-

! .$/'"%(0*&$(!%"::,&-

Page 33: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ((

SIMT = no Vector Lane Crossing • Vector Instruction sets always want "intra-lane" operations

– Dot product: $V1_1 \times V2_1 + \dots + V1_N \times V2_N$

– Recurrences:

• $A_1 = B_1$

• $A_2 = B_1 + B_2$

• $A_3 = B_1 + B_2 + B_3$

• Crossing vector lanes big source of complexity

– Hardware complexity: wires, muxes

– ISA complexity: instruction proliferation

• SIMT doesn't need vector lane crossing instructions, because different lanes are from different threads.

Page 34: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ()

P*%*>>$>(.$/'"%(0$&-'#(V*:'*-$

3L(1:/*>*%8

3b(1@$/'"%8.0KLd $_ $Ld$b$L m(((((((((((((((((((CCC

3g(1@$/'"%8.0KLb $_ $Ld$b$L m( $La$Lg$Lb

V*:'$+

V*:'$+

V*:'$+

3d(1@$/'"%8.0KLd $_ $Ld$b$L m(((((((((((((((((((CCC

3`(1@$/'"%8.0KLb $_ $Ld$b$L m( $La$Lg$Lb

V*:'$+

V*:'$+

V*:'$+

3a(1:/*>*%8

Page 35: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (*

.$/'"%(),;$(P,?$>,&,&-($>,;,&*'$:(.$/'"%(0$&-'#(V*:'*-$

3L'K_

3b($_'KL

(((($L'Kb

((CCCCCC

((($Ld'KL`

3g($_'KLn

(((($L'KLc

((CCCCCC

((($Lb'Kbw

3a($_CCC

(((($L((CCC((($Ld

3d($_(((($L((CCC((($Lb

7"(=*:'$+(NZI(9&9:$+(@$/'"%(>$&-'#(:D,??$+

!":'h(>*'$&/B

NC-C(!%*BIL(=*:(7U)(*(?*%*>>$>(;*/#,&$h,'(=*:(*(?,?$>,&$+(@$/'"%(;*/#,&$1=,'#(/#*,&,&-8

Page 36: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (+

2/*>*%(1*&+(2/*>*%(43468*>:"($>,;,&*'$(.

0(=*:'*-$

3L__'K_

3b__'KL

3b_L'Kb

((CCCCCC

3bLd'KL`

3g__'KLn

3g_L'KLc

((CCCCCC

3gLb'Kbw

3a__CCC

3a_L((CCC3aLd

3d__

3d_L((CCC(((3dLb

7"(=*:'$+(NZI(9&9:$+(@$/'"%(>$&-'#(:D,??$+

!":'h(>*'$&/BQ(,&:'%9/',"&("@$%#$*+

Page 37: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (,

.$/'"%(),;$(P,?$>,&,&-(="%D:(=,'#(234)(.$/'"%()#%$*+,&-

3L'K_

3b($_'KL

(((($L'Kb

((CCCCCC

((($Ld'KL`

3g($_'KLn

(((($L'KLc

((CCCCCC

((($Lb'Kbw

3a($_CCC

(((($L((CCC((($Ld

3d($_(((($L((CCC((($Lb

3L

3b($_(((($L

((($Ld

3g($_(((($L

((($Lb

3a($_(((($L

((($Ld

3d($_(((($L((CCC((($Lb

3L

3b($_(((($L

((($Ld

3g($_(((($L

((($Lb

3a($_(((($L

((($Ld

3d($_(((($L((CCC((($Lb

0*&$(_)#%$*+(_

0*&$(gL)#%$*+(gL

m

CCC

0*&$(L)#%$*+(L

((CCC ((CCC

((CCC

((CCC((CCC

((CCC

Unclear how much existing GPUs do this.

Apparently short vectors (2 or 4 ticks long.)

Many (but not all, not even most) of these slides amount to showing how

time pipelined vector instructions on a SIMT vector lane threaded microarchitecture solve many utilization problems.

But don't worry, there's other neat stuff.

Page 38: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ("

.$/'"%(4*:D(V*:'*-$

3L(1:/*>*%8

3b(1@$/'"%8.0KLd $_ $b$L m(((((((((((((((((((CCC

3g(1@$/'"%8.0KLb

$_ $Ld$b$L m( $La$Lg$LbV*:'$+

V*:'$+

V*:'$+

V*:'$+

V*:'$+

$LaV*:'$+

$LbV*:'$+

$Ld$Lg

3a(1:/*>*%8

3LA"%(*>>(,

,A("++('#$&(3b($>:$(3g3a

Page 39: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & (%

2/*>*%(2,;?>$(2346(!"&+,',"&*>(V*:'*-$

3LA"%(*>>(,

,A("++((('#$&(3b((($>:$(3g$&+,A

$&+(>""?3a

0*&$(_)#%$*+(_

3g

3a

3L

0*&$()#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+

V*:'$+

V*:'$+

V*:'$+

Page 40: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )!

4*&B(?$"?>$(:*BCCC

6"$:&E'('#,:(;$*&'#*'(234)W2346W.$/'"%(0*&$()#%$*+,&-

3:(&"(;"%$($AA,/,$&'('#*&V,+'#(P*%*>>$>(.$/'"%:(=,'#(.$/'"%(4*:D:

Y

T$>>(7"x)#$%$(*%$(>"':("A(=*B:('"(%$+9/$(2346(/"&+,',"&*>(=*:'*-$

Page 41: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )'

2346(!"&+,',"&*>(V*:'*-$=,'#(),;$(P,?$>,&$+(.$/'"%:

3LA"%(*>>('#%$*+:(,

,A("++((('#$&(3b((($>:$(3g

3a

0*&$(_)#%$*+(_

3g

3a

3L

0*&$(L)#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+ V*:'$+

V*:'$+ V*:'$+

3&:'%9/',"&R$'/#$:

Page 42: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )$

O$+9/$+(2346(!"&+,',"&*>(V*:'*-$=,'#(),;$(P,?$>,&$+(.$/'"%:

3LA"%(*>>('#%$*+:(,

,A("++((('#$&(3b((($>:$(3g

3a

0*&$(_)#%$*+(_

3g

3a

3L

0*&$(L)#%$*+(L

3b

3a

3L

0*&$(b)#%$*+(b

3g

3a

3L

0*&$(g)#%$*+(g

3b

3a

3L

V*:'$+ V*:'$+

V*:'$+ V*:'$+

!"#$%&'$(')*'$+%('*,-'+&%$./'-"$0$(')*'$+%('*,-'+&%($&%$+12/$2)%/($3"+2/$4,(#$/5/-,'+%6$7,2'+-#-2/$'+7/$8+8/2+%/1$9/-'&*$+%('*,-'+&%(:

;<//$=/)2!&*21>/-"?-&7@

Page 43: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )(

),;$(P,?$>,&$+(.$/'"%:*>:"(#$>?(.$/'"%(4*:D,&-(

=,'#,&(0*&$3LA"%(*>>('#%$*+:(,(((A"%(*>>(@$/'"%($>$;$&':(i((((((,A(/"&+1i8('#$&(((((((((@piq(K(CCC3a

V#$%$(/"&+piq(K((((*%G,'%*%B5($C-C((((l(_5L5L5__5L5_opiq

H::9;,&-(/"&+piq(,:(*@*,>*G>$('",&:'%9/',"&(:$f9$&/$%

0*&$(_)#%$*+(_

@pbq@pdq

@pLq

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

@pbq@pdq

@pLq@pbq@pdq

@pLq@pbq@pdq

@pLq

7U)

@p_q@pLq@pbq@pgq@paq@pdq@p`q@pnq

3LA"%(*>>('#%$*+:(,(((A"%(*>>(@$/'"%($>$;$&':(i((((((,A(/"&+1i8('#$&(((((((((@piq(K(CCC3a

V#$%$(/"&+piq(K((((*%G,'%*%B5($C-C((((l(_5L5L5__5L5_opiq

H::9;,&-(/"&+piq(,:(*@*,>*G>$('",&:'%9/',"&(:$f9$&/$%

Page 44: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ))

)#*'(=*:($*:BCCC

<9'(>$'E:(+"(:";$'#,&-(#*%+$%=#$%$('#$([2346\('#%$*+:'*D$(+,AA$%$&'(?*'#:

Page 45: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )*

P%$+,/*',"&($M$/9'$:(*>>(?*'#:S(234)("&>B($M$/9'$:(,&:'%9/',"&:(

9:$+(*'(>$*:'("&/$

:=,'/#1A1,88(l/*:$(_h(3_S(G%$*DS/*:$(Lh(3LS(G%$*DSm/*:$(nh(3nS(G%$*DSo

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

_

b

d

n

V*:'$+

V*:'$+

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

L

b

d

n

0$::(V*:'$

0$::(V*:'$

Page 46: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )+

V#B(&"'(*(>""?(G9AA$%(*'($*/#(>*&$Y

0*&$(_)#%$*+(_

3_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

3_A"%(*>>('#%$*+:(3(((%$?$*'(L__((((((A"%(*>>(@$/'"%($>$;$&':(i(((((((((3Lh(,A(/"&+p,5iq('#$&((((((((((((3bh(@piq(K(CCC3g

V#$%$(/"&+p,5iq(K((((*%G,'%*%B5($C-CL_LLL_____LL((((

H::9;,&-(/"&+p,5iq(,:(7U)(*@*,>*G>$('",&:'%9/',"&(:$f9$&/$%

3LCL3bCL3LCb3bCb3LCg3bCg3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg3bCg3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg

3LCa3bCa3g

3_3LCL3bCL3LCb3bCb3LCg3LCa

3g

3_3LCL3LCb3bCb3LCg3LCa3bCa

3g

3_3LCL3bCL3LCb3LCg

3LCa3bCa3g

3bCg3bCg

V*,'(9?3A$'/#$:(:*@$+

Page 47: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ),

?/

,s

+

:

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

M

?/

%A

+s

4$;"%B(W(s(:#*%$+(G$'=$$&(.$/'"%(0*&$([P%"/$::"%:\

.$/'"%(0*&$()#%$*+,&-0""?(<9AA$%:

,>G ,>G ,>G ,>G ,>G

Page 48: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )"

2346(y(4346(:?$/'%9;

! 4"%$(:'9AA(%$?>,/*'$+(?$%(>*&$

" 3&:'%9/',"&(G9AA$%

" 0""?(<9AA$%

CCC

! 0$::(2346(,+>$&$::

! 4"%$(+$/"9?>$+('#$(?%"/$::"%:

! 4"%$(/"9?>$+('#$(?%"/$::"%:

! 4"%$(*%$*W?"=$%(:*@$+

! 4"%$(:'9AA(:#*%$+

Page 49: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & )%

O$i,--$%,&-()#%$*+:(G$'=$$&(!B/>$:("A(*(;9>',I/B/>$(V*%?

3L%$?$*'(,K_('"(CCC((((A"%('#%$*+:('K_('"(n

(,A($@$&1,8(]UO($@$&(1'8((('#$&(3b(((($>:$(3g

3a

0*&$(_)#%$*+(_

'_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

'L 'b 'g

U%,-,&*>5(3;G*>*&/$+(K(d_r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

,b,bU'#$%(V*%?:(CCC,g,gU'#$%(V*%?:(CCC,b,bU'#$%(V*%?:(CCC,g,g

'L 'g'd 'n

'_ 'b'a '`

'L 'g'd 'n

'_ 'b'a '`

'_ 'L 'b 'g

O$i,--$%$+(=,'#,&(V*@$A%"&'(K(L__r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

'L 'g'd 'n'_ 'b'a '`,g

,b

'L 'g'd 'n'_ 'b'a '`,b

,g

'L 'g'd 'n'_ 'b'a '`,b

,g

NM*;?>$(*::9;$:)#%$*+(:D$=,&-(G$'=$$&(=*@$:(,&(*(=*@$A%"&'

Page 50: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *!

At least 4 levels of thread/warp rebalancing

• Rejiggering within a warp (wavefront, multicycle SIMT thread group) between cycles (phases, waves) of same warp

– Fairly easy – just a few more bits / logic on mask

• Migrating (exchanging) threads between warps but staying in same lane

– Fairly easy – more bits

• Migrating threads between lanes of same warp

– Hard – involves copying registers.

– Probably not worth doing, except for RF lane tricks

• Rebuilding warps (wavefront, multicycle SIMT thread group) completely

– Hard – involves copying registers

– Probably only worthwhile after really long latency events.

Page 51: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *'

CCC CCC

T"=(*G"9'(O$G*>*&/,&-()#%$*+:G$'=$$&(V*%?:Y

3L%$?$*'(,K_('"(CCC(((A"%(=*%?:(=K_('"(L((((((A"%('#%$*+:('K_('"(g

(,A($@$&1'Q,8((('#$&(3b(((($>:$(3g

3a

0*&$(_)#%$*+(_

3L

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

3L3b3b

3b3b

CCCCCC3a3a

3L3L

3g3g

3g3gCCCCCC3a3a

3L3L3b3b

3b3b

CCCCCC3a3a

3L3L

3g3g

3g3gCCCCCC3a3a

3b 3b

CCCCCC3a3a

3g3g

CCC3a3a

3b 3b

CCCCCC3a3a

3g3g

CCC3a3a

U%,-,&*>5(3;G*>*&/$+

O$G*>*&/$+

V*%?(_

V*%?L

7$=(V*%?(_

7$=(V*%?L

Page 52: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *$

T"=('"(O$G*>*&/$()#%$*+:(G$'=$$&(V*%?:

! !";?,>$%(:#"9>+(-%"9?(:,;,>*%('#%$*+:(*:(;9/#(*:(?"::,G>$

" T,&':('"('#%$*+(:/#$+9>$%

" <9'(:9/#(:'*',/(G*>*&/,&-(/*&("&>B(-"(:"(A*%

! 6B&*;,/(G*>*&/,&-

" 4"@,&-('#%$*+:(G$'=$$&(=*%?:=,'#,&('#$(:*;$(>*&$I*>,-&$+(%$-,:'$%(A,>$(>""D:(+"*G>$(1/#*&-$("A(?",&'$%8(1TVY8

" 4"@,&-('#%$*+:(G$'=$$&(>*&$(OR:,&@">@$:(/"?B,&-(Kz(Z/"+$Y(2"A'=*%$Y(24:Y

Page 53: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *(

O$;$;G$%(3&'$%>$*@$+(@:C(2"N4))#%$*+,&-(=,'#,&(0*&$:Y

0*&$(_)#%$*+(_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

L L L L

U%,-,&*>5(3;G*>*&/$+

V*%?Lb b b bV*%?bg g g gV*%?gL L L LV*%?Lb b b bV*%?bg g s;,:: gV*%?gL L L LV*%?Lb b b bV*%?ba a a aV*%?a

g g g gV*%?gb b b bV*%?ba a a aV*%?a

m(=*,'(',>>(V*%?(g(;,::(%$'9%&:%$?>*/$:(V*%?(L(,&(%9&&,&-(>,:'

4,&,;9;(>*'$&/B(A"%(+*'*(+$?$&+$&/,$:#,++$&(GB(34)(13&'$%>$*@$+(49>','#%$*+,&-8

0"&-(>*'$&/,$:5($:?C(9&?%$+,/'*G>$(>"&-(>*'$&/,$:1)0<(;,::5(/*/#$(;,::8(#,++$&(GB(2"N4)

U?',"&(_h(G>"/D(*>>('#%$*+:(,&(=*%?(,A(*&B(;,::$:

U?',"&Lh(:?>,'(=*%?('"(*>>"=(9&G>"/D$+('#%$*+:('"(%9&

Page 54: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *)

Little Known Fact

• I developed the P6-style RS in part for vector scatter/gather

– Allow earliest elements to complete ASAP

– Instead of waiting for all pieces to arrive

• Problem: once vector is split up into elements hard to regain efficiency of controlling all vector at a time.

• Vector threading makes this easy. Key: PC

– Literally, the PC (per vector lane thread) is the "key" as in database join key – the thing that tells you how to get threads back into alignment.

Page 55: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & **

<"''";(0,&$

• There's a hell of a lot that you can do to avoid SIMD wastage vs. Parallel Vector Wastage

– Time pipelined vectors in lanes

– Overlapping different multicycle instructions

– Loop buffers per lane

– Rebalancing, e.g. after gather

• Unclear which are done in present GPUs

– Most seem to be

– Some are in recent dissertations

Page 56: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *+

Summary of SIMT VPU Utilization Improvements

• SIMT inherently and fundamentally is more efficient, filtering out null masks. (Minor)

• SIMT + vector time pipelining

– More efficient masking and VL (Major)

– Extra instruction fetch opportunities during multicycle vector operations. (Major)

• Also: extra instruction fetch opportunities during multicycle wavefronts (Major)

• SIMT rebalancing

– Rejiggering within warp: same lane, different cycles (Major)

– Migrating between warps at same lane (Minor)

– Rebuilding warps (Major, but complex)

• Other: see Fung.  e.g. skewing

Page 57: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *,

<"''";(0,&$b

• SIMT / SIMD / Vector Lane Threading

• Cheap microarchitecture for threads that have similar (but not identical) control flow

– Shares front end …

• Same "ISA" (conceptual) can run on MIMD, if that is better for workload

– i.e. SIMT / SIMD / Vector Lane Threading is microarchitecture, not macroarchitecture

– Can run any reasonable instruction set within vector lane

• Nice: ENDIF, ENDLOOP convergence indications

Page 58: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *"

<"''";(0,&$g

• SIMT / SIMD / Vector Lane Threading

– Encourage programmer to have lots of lightweight threads

• GLEW OPINION:

– It is easier to write programs with 1000s of threads than with 10s of threads

– PRAM style programming

• These are NOT just vector machines.

Page 59: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & *%

)"?(L_(%$*:"&:(=#B(@$/'"%(>*&$('#%$*+,&-(1*D*(234)5(*D*(734)8(,:(G$''$%('#*&(L`(=,+$(2346(@$/'"%(?*%*>>$>

LC'@'345+';.;"=.*"'#)*'$&.<.)==A'&?*'3456'<"#$%&';)&)=="='#%-"7'B?$'*%$'<.#"'<"&1)8

bC(3A(<"#$%&'@(C'?$.=.D)$.%*(,:(*&(,::9$5('#$&(234)(?%"G*G>B(#*:(1"%(/*&(G$(G9,>'('"(#*@$8(:,-&,A,/*&'>B(G$''$%(@$/'"%(H0Z($AA,/,$&/BC234)(,:(>$::(@9>&$%*G>$('"(?%"G>$;:(+9$('"(@$/'"%(>$&-'#(*&+(@$/'"%(;*:DC()#,:(,:($M?>*,&$+(,&('#$(:>,+$:C

E8'345+'.1'="11'-";"*-"*$'%*'1%F$0)&"'-"<"=%;>"*$8e"9(/*&(/#*&-$('#$([@$/'"%(=,+'#\("A(*(234)(;,/%"*%/#,'$/'9%$5(*&+(*>>($M,:',&-(/"+$(/"&',&9$:('"(%9&5(9:9*>>B(:',>>($AA,/,$&'>BCV#$%$*:(,A(B"9(/#*&-$5($C-C(A%";(dLb(G,'(@$/'"%:('"(L_ba(G,'(@$/'"%:5(2346(?*%*>>$>(@$/'"%(:"A'=*%$(&$$+:('"(G$(%$=%,''$&C

aC(2346(@$/'"%(?*%*>>$>(32H:(*>=*B:(:>,+$(+"=&('#$(:>,??$%B(:>"?$("A(/%"::(<"#$%&'=)*"'.*1$&?#$.%*1C{(R,%:'(B"9(=*&'(,&&$%(?%"+9/'5('#$&(%$/9%%$&/$:5('#$&m{(234)(+"$:&|'(#*@$('#,:('$;?'*',"&5(:,&/$('#$(+,AA$%$&'(>*&$:(*%$(A%";(+,AA$%$&'('#%$*+:C

G8'345+'1"">1'$%'B"')'B"$$"&'>)$#,'F%&'1%F$0)&"C{(3(;9:'(*+;,'('#*'(3(=*:(:9%?%,:$+(=#$&(3(>$*%&$+('#,:5(:,&/$(3|;(*(@$/'"%(-9B(A%";(=*B(G*/DC{(<9'(,'(:$$;:('"(G$('#$(/*:$C{(6.&"#$HI1'@94'.1'B)1.#)==A')'345+'@94C

e"9(&$$+(*(-""+(/";?,>$%('"(/"&@$%'(A%";(234)(6,%$/']('"(2346(?*%*>>$>(@$/'"%:C{(

`C(234)(;*B(G$(/"&:,+$%$+(i9:'(*(:'$?(*>"&-('#$(%"*+('"(5456C

nC()#$%$(*%$(:,-&,A,/*&'(;%0"&'1)<.*/1'%;;%&$?*.$."1(+9$('"(',;$(?,?$>,&,&-(@$/'"%:C

3C$C(2346(?*%*>>$>(@$/'"%:(=*:'$(?"=$%5(G$/*9:$('#$B(+"(&"'('*D$(*+@*&'*-$("A('#$(/"%%$>*',"&(,&(+*'*(?*''$%&:(G$'=$$&(:9//$::,@$(@$/'"%($>$;$&':C

V#,>$(,'(,:(?"::,G>$('"(G9,>+(*(&"&I234)(@$/'"%(?,?$>,&$('#*'('*D$:(*+@*&'*-$("A('#,:5(,'(,:(&"'(2346(@$/'"%(?*%*>>$>C{{(NC-C(,'(,:(@$/'"%(',;$(?,?$>,&$+5(=,'#(/#*,&,&-C{(NC-C(,&:'$*+("A(*(L`(=,+$(@$/'"%(;*/#,&$5(B"9(;,-#'(G9,>+(*(@$/'"%(;*/#,&$('#*'('*D$:(a(/B/>$:('"($M$/9'$(*(@$/'"%(,&:'%9/',"&("&(a(H0Z:5(=#,/#(D$$?:(a(:9/#(,&:'%9/',"&:(,&(A>,-#'5(/#*,&,&-(,&'"($*/#("'#$%('"(D$$?($*/#("'#$%(G9:BC

3|@$(="%D$+("&(;*/#,&$:('#*'(+,+(:9/#(/#*,&,&-C(1J"9>+8C{(3&(A*/'5(3(,&@$&'$+('#$(P`(O2(*:(*(/#*,&,&-(/"&'%">(9&,'C()#$B(*%$(/";?>$MC

234)(#*:(;9/#(:,;?>$%(?,?$>,&$(/"&'%">('#*&(@$/'"%(/#*,&,&-C

JK7'1%'$,)$L1'%*=A'M'&")1%*18'5%&"'0.=='B"'#%>.*/8'

Page 60: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +!

<*/D9?

Page 61: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +'

)"(0$*%&(4"%$

• David Kanter, Real World Tech article: – http://realworldtech.com/page.cfm?ArticleID=RWT090808195242&p=12

– It is by no means completely accurate.  For example, I don't think that the author understands the potential of SIMT - he thinks that SIMT will just evolve back to SIMD.  He thinks that the "ensemble" intra-warp instructions are a step like this, whereas I see these as being much more like multi-thread barrier instructions.

– Nevertheless, it is the best I have found so far.  It is good enough that Nvidia folk have started referring programmers to this article, rather than Nvidia internal documents.

• UIUC GPU course, Wen-Mei Hwu: – http://courses.ece.illinois.edu/ece498/al/

• Wilson Wai Lun Fung, UBC MS thesis – https://circle.ubc.ca/bitstream/2429/2268/1/ubc_2008_fall_fung_wilson_wai_lun.pdf

– Dynamic Warp Formation: Exploiting Thread Scheduling for Efficient MIMD Control Flow on SIMD Graphics Hardware

Page 62: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +$

!"@$%,&-(4B(H::(

• This presentation is NOT about Larrabee

• But folks often ask for LRB references:

" !"##"$%%&'(')"*+,-.#%'/01'(#2345%256#%'7.#'8496":'-.;<654*=>-.//0&12342/'5&6789&:./;2.<'5&=/3>&1?/.<942'5&@7;&A7/B0CD'5&E3>D.24&FG/.BD$5H/.I22?&68G20'5&1C2?D2<&J8<K3<B'5&FI.;&-.K2'5&J2/2;0&1892/;.<(5&L7G2/C&:.M3<'5&L792/&=B?.B.'5&=I&N/7>D7OBK3'5&@7<3&J8.<'5&.<I&H.C&P.</.D.<(F:E&@/.<B.>C37<B&7<&N/.?D3>B5&Q74R&$,5&S7R&(5&F/C3>42&'"5&H8G43>.C37<&I.C2T&F898BC&$!!"RDCC?T##I7O<47.IR3<C24R>7;#C2>D<74790#./>D3C2>C8/2UB343>7<#1399/.?DV-.//.G22V?.?2/R?IW

" P2>K5&-LX&D.B&.&O3K3?2I3.&?.92Y

Page 63: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +(

H/*+$;,/(V"%Dh(R9&-

! V,>:"&(V*,(09&(R9&-5(Z<!(42('#$:,:" #''?:hWW/,%/>$C9G/C/*WG,':'%$*;WbabwWbb`cWLW9G/�b__c�A*>>�A9&-�=,>:"&�=*,�>9&C?+A

" 6B&*;,/(V*%?(R"%;*',"&h(NM?>",',&-()#%$*+(2/#$+9>,&-(A"%(NAA,/,$&'(4346(!"&'%">(R>"=("&(2346(J%*?#,/:(T*%+=*%$

• “We propose dynamic warp formation and scheduling, a mechanism for more efficient SIMD branch execution on GPUs. It dynamically regroups threads into new warps on the fly following the occurrence of diverging branch outcomes. We show that a realistic hardware implementation of this mechanism improves performance by an average of 47% for an estimated area increase of 8%.”

• MIMD=1; NREC SIMD = 0.18; PDOM SIMD = 0.7; DWF SIMD = 0.9

– PDOM best on Black-Scholes (=MIMD, DWF=0.85x). DWF better elsewhere.

– BS: transcendental subroutines with branches

• PDOM: MIMD=0.26, 8-wide=0.21, 16-wide=0.19, 32-wide=0.16

• Real program: NREC=0.47, PDOM=0.83, Oracle=0.84

• Register lane aware dynamic warp formation

• Warp scheduling policies: Time Stamp, Program Counter, prefer Majority, prefer Minority, PDOM priority

– Majority usually best; sometimes PC or PDOM priority

Page 64: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +)

H/*+$;,/(V"%Dh(X"FB%*D,:($'(*>5(.$/'"%(0$&-'#()#%$*+,&-

• Kozyrakis et al, Vector Length Threading

– Christos Kozyrakis (Stanford prof) was very happy to hear that I was calling SIMT/DIMT/NIMT vector length threading, generically, since he has published a paper by that name:

• Citation: Suzanne Rivoire, Rebecca Schultz, Tomofumi Okuda, Christos Kozyrakis, "Vector Lane Threading," icpp, pp.55-64, 2006 International Conference on Parallel Processing (ICPP'06), 2006

– Unfortunately, their use of the term “VLT” differs from my use:

• Abstract excerpt: “we propose vector lane threading (VLT), an architectural enhancement that allows idle vector lanes to run short-vector or scalar threads.”

• I like the term “vector length threading” because it accurately describes my understanding of SIMT/DIMT/NIMT.

• I am funny like that: I like my composite terms to have accurate meanings.  I will not change my usage of this term at this time. I think the confusion is minor.

– Their VLT paper emphasizes non-coherence: executing different programs, different codes, with each getting a different number of vector lanes. They talk about running pure scalar threads for applications that parallelize but which do not vectorize.

• Much like my NIMT. Although my NIMT has been emphasizing divergence within a group of similar threads, not disparate threads.

– Emphasizes VL > PVW, with time pipelining. Variable length vectors.

– “Because vector instructions specify multiple element operations, each instruction occupies a vector functional unit for several cycles even in multi-lane implementations. Hence, the VCL structures are much simpler than their scalar counterparts because a lower instruction issue rate is sufficient in the vector unit (typically one or two instructions per cycle).”

– Must feed instructions to the vector unit, via scalar logic and/or vector control logic.  Issue: replicate or multiplex.

• The purist Nvidia SIMT has no separation – scalar code runs in the vector lanes.

• However, I have been thinking of CSEing scalar code common to coherent threads. E.g. loop control.

– Evaluates on a microarchitecture with scalar/vector. Scalar is superscalar, SMT, OOO.

– I see absolutely no mention of coherence, of recognizing when the same instruction is being executed by multiple threads. Thus, differs in a fundamental way from the ideas I am exploring.

– Bottom Line: similar, but substantial differences.

Page 65: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +*

H/*+$;,/(V"%Dh(X%*:#,&:DB5(H:*&"@,/($'(*>5(.$/'"%()#%$*+,&-

• Krashinsky, Asanovic, et al, Vector Threading (VT).  Also, the “Scale” computer architecture

– Ronny Krashinsky contacted me. I enthusiastically enjoyed his research. This was one of the first “New Vector” papers I encountered.

! X%*:#,&:DB5(OC5(<*''$&5(!C5(T*;?'"&5(4C5(J$%+,&-5(2C5(P#*%%,:5(<C5(!*:?$%5(kC5(*&+(H:*&"@,/5(XC(b__aC()#$(.$/'"%I)#%$*+(H%/#,'$/'9%$C(3NNN(4,/%"(ba5(`(17"@C(b__a85(caIw_C(6U3K(#''?hWW+MC+",C"%-WL_CLL_wW44Cb__aCw_(

" NC-C(a(>*&$:C(.P(1@$/'"%(P%"/$::"%:8(:'%,?$+(*/%"::(>*&$:C(0*&$:(+$/"9?>$+(A%";($*/#("'#$%C(

" H3<:(1H'"5,/(3&:'%9/',"&(<>"/D:85(%*#$%(>,D$(/$+>>:(^(&"(P!:5(H3<:(%$f9$:'($*/#("'#$%5("%(A%";(/"&'%">(?%"/$::"%C

• AIBs expose uarch state like chain (bypass) registers at each ALU input.

– Vector fetch commands are broadcast to all lanes. Each tag checks in an AIB cache. (The moral equivalent of an I$ per lane; except that the instructions are these AIB packets.)

– No mention of coherence, sharing of ifetch between separate threads.  Seems further away from SIMT.

– Krste Asanovic recommended the following papers from his group

! O"&&B(X%*:#,&:DB5(!#%,:'"?#$%(<*''$&5(4*%D(T*;?'"&5(2'$@$&(J$%+,&-5(<%,*&(P#*%%,:5(k*%$+(!*:?$%5(*&+(X%:'$(H:*&"@,�5(~)#$(.$/'"%I)#%$*+H%/#,'$/'9%$~5(gL:'(3&'$%&*',"&*>(2B;?":,9;("&(!";?9'$%(H%/#,'$/'9%$(132!HIgL85(49&,/#5(J$%;*&B5(k9&$(b__aC#''?hWW===C/*-C/:*,>C;,'C$+9W:/*>$W?*?$%:W@'*I,:/*b__aC?+A

! 4*%D(T*;?'"&(*&+(X%:'$(H:*&"@,�5(~!";?,>,&-(A"%(.$/'"%I)#%$*+(H%/#,'$/'9%$:~5(3&'$%&*',"&*>(2B;?":,9;("&(!"+$(J$&$%*',"&(*&+(U?',;,F*',"&(1!JUIb__c85(<":'"&5(4H5(H?%,>(b__cC(#''?hWW===C/*-C/:*,>C;,'C$+9W:/*>$W?*?$%:W@'/";?,>$%I/-"b__cC?+A

! O"&&B(X%*:#,&:DB5(~.$/'"%I)#%$*+(H%/#,'$/'9%$(*&+(3;?>$;$&'*',"&~(P#C6C()#$:,:5(4*::*/#9:$'':(3&:','9'$("A()$/#&">"-B5(4*B(b__nC#''?hWW===C/*-C/:*,>C;,'C$+9W:/*>$W?*?$%:WD%*:#,&:DBI?#+C?+A

! O"&&B(X%*:#,&:DB5(!#%,:'"?#$%(<*''$&5(*&+(X%:'$(H:*&"@,�5(~3;?>$;$&',&-('#$(2/*>$(.$/'"%I)#%$*+(P%"/$::"%~5(H!4()%*&:*/',"&:("&(6$:,-&(H9'";*',"&("A(N>$/'%"&,/(2B:'$;:(1)U6HN285(Lg1g85(aLhLIaLhba5(k9>B(b__cC(#''?hWW===C$$/:CG$%D$>$BC$+9W�D%:'$W?*?$%:W*aLID%*:#,&:DBC?+A

! !#%,:'"?#$%(<*''$&5(T,+$'*D*(H"D,5(*&+(X%:'$(H:*&"@,�5(~)#$(!*:$(A"%(4*>>$*G>$(2'%$*;(H%/#,'$/'9%$:~5(V"%D:#"?("&(2'%$*;,&-(2B:'$;:(*'(aL:'(3&'$%&*',"&*>(2B;?":,9;("&(4,/%"*%/#,'$/'9%$(143!OUIaL85(0*D$(!";"5(3'*>B5(7"@$;G$%(b__c#''?hWW===C$$/:CG$%D$>$BC$+9W�D%:'$W?*?$%:W;*@$&I;,/%"I=::b__cC?+A

Page 66: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ++

Comparing this Presentation's Coherent Threading (CT) to Krashinsky et al's Vector Threading (VT)

!"#$%&'(

• Many similarities. I admit inspiration by Krashinsky. VT was one of the first examples of “New Vectors” that I encountered. But important differences, to my mind.

• Mainly:

– VT (SCALE) seems to be a decoupled streaming architecture on a vector-like substrate.  SW exposed.

– CT is a vector-like substrate implementing lane threads.  Decoupling, etc., seeking to be hidden, so as not to be an obstacle to MIMD, eventually.

i.e. CT attempts to hide the details as microarchitecture, rather than exposing them to software ISA.

Page 67: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +,

M

?/

,s

+s

M M M

4

.03V

?

+ + + +%A

+ + + +

? ? ?

:

Page 68: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +"

2#*%$+(4$;"%B

49>',/>9:'$%(49>','#%$*+,&-('"(234)

?/ ,s + %A

M

3&'%*/>9:'$%(GB?*::

+Ls

+bs

:

:

M

3&'%*/>9:'$%(GB?*::

+Ls

:

M

3&'%*/>9:'$%(GB?*::

+Ls

: M

3&'%*/>9:'$%(GB?*::

+Ls

:

RP !";;,'

In MCMT the front end is shared, but any instruction goes to one and only one cluster. In SIMT, the front end is shared, AND any particular instruction may go to many clusters (aka lanes).

MCMT and SIMT share the front end, but replicate pipestages involved in the critical loop

Page 69: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & +%

?/

,s

+

:

M

=_'_

%A

M

=_'L

%A

M

=_'b

%A

M

=_'g

%A

s(W(4

Running SIMD vector parallel code on SIMT vector lane threaded uarch

)L )L )L )L

=b'_ =b'L =b'b =b'g

=g'_ =g'L =g'b =g'g

V*%?(_(K(7(234)('#%$*+:

V*%?(L(K(L(7I=,+$((2346('#%$*+

V*%?(b(K(7(234)('#%$*+:

V*%?(g(K(7(234)('#%$*+:

Page 70: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,!

O9&&,&-(2346("&(234)5(*&+(@,/$(@$%:*

3456'%*'345+

• Create a “warp” of N vector lane threads

• Execute in lockstep (which happens naturally on SIMT)

• Use predication or branches to emulate vector masks

V#BY

(()"(:?$$+(9?(>*'$&/B(:$&:,',@$('#%$*+Y(1$@$&(,A(,'(=*:'$:(CCC8

345+'%*'3456

O$?>*/$(*>>(;*:D$+("?$%*',"&:

!!"#!$!%&'"()"*+!,-#./!0

V,'#('$:':(A"%(&9>>(;*:D

(!12!0!$!3!4%5%!678&!!"#!$!%&'"()"*+!,-#./!0

678&9!:

3A(&9>>(;*:D:(*%$(*'(*>>(>,D$>BC

V#BY(

  Because it can be faster even on a SIMD; better still to reorganize code to branch around many such operations.

  But doesn't take advantage of SIMT HW optimizations to improve utilization.

V#,/#(+"(B"9('#,&D(,:(;"%$($AA,/,$&'Y(

Page 71: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,'

Data Pattern Dependent Power Optimization Time Pipelined Vectors • Observation: adjacent pixels close in value

Planar  (soa) or  Chunky (aos: xyzw or rgba or …)

• SIMD vectors put adjacent pixels in different vector lanes. This is bad.

• Running nearby pixels through same vector lane → less toggling → less power

• Credit: TP

– Idea: instead of 16 wide SIMD, implement 4 sets of 4 wide SIMD taking 4 cycles to execute a 16 wide vector instruction. With chaining. (Complicated control.)

– Or: use SIMT, with vector lane threads operating on chunks (e.g. cache lines), as opposed to partitioning a cache line between vector lane threads.  (Same optimization as for MIMD.)

Page 72: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,$

?/

,s

+

:

M

?/

M

?/

M

?/

M

?/

s(W(4

3&'%*I>*&$(@:(3&'$%I>*&$(O$-,:'$%(R,>$:

V*%?(_

V*%?(L

V*%?(b

Increasing # intra-lane threads (traditional, latency tolerant) increases size of per-lane RF => slower => more pipestages or lower frequency

Increasing # thread lanes (vector lane threads, SIMT/NIMT) does not increase RF size since separate arrays => less impact on frequency

?/

%A%A%A%A

?/ ?/ ?/ ?/

?/ ?/ ?/ ?/

V*%?(_

V*%?(L

V*%?(b

Page 73: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,(

?/

,s

+

:

M

?/

M

?/

M

?/

M

?/

s(W(4

.*%,*G>$(2,F$(O$-,:'$%(R,>$:

V*%?(_

V*%?(L

All (?) GPUs have variable numbers of registers per thread (or per SIMT thread group, e.g. warp).

(0 )#%$*+(3+(:$>$/':(P!(Q(ORCG*:$ORCG*:$(Q(%$-,:'$%(&9;G$%(+,:'%,G9'$+

?/

%A%A%A%A

?/ ?/ ?/ ?/

?/ ?/ ?/ ?/

V*%?(_

V*%?(L

V*%?(b

Page 74: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,)

?/,s+:

M

?/

M

?/

M

?/

M

?/

s(W(4

!>":$%(0""D(*'(234)(OR

?/

V*%?(_

'_?/ ?/ ?/ ?/'_?/ ?/ ?/ ?/'_?/ ?/ ?/ ?/'_

?/ ?/ ?/ ?/

V*%?(L

'_?/ ?/ ?/ ?/'_?/ ?/ ?/ ?/'_?/ ?/ ?/ ?/'_

V*%?(_

'_

'_

'_

'_

V*%?(_

'_

'_

'_

'_

%A%A%A%A

All (?) GPUs group vector lane threads in multi-cycle parallel thread groups.

E.g. Nvidia warp = 4X larger than number of lanes

E.g. ATI wavefront = 4X larger than number of (vliw) lanes

Preg# = thread.rf.base + Lreg# *4 + cycle#

Page 75: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,*

SIMT register lookup: simple vs. dynamic

• Simple lookup

Preg# = thread.rf.base + Lreg#*4 + cycle

= rf.base[thread#] + Lreg#*4 + cycle

Lreg# must be distributed to vector lanes with each instruction. Other values can be preloaded.

• More dynamic lookup

Preg# = thread.rf.base + Lreg#*4 + thread_in_lane

i.e. 2 more bits per lane per instruction must be distributed.

Or: compute from lane mask that controls whether thread is active for a particular NIMT instruction

Page 76: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,+

?/,s+:

M M M M

s(W(4

234)(ORh('%*+,&-(=,+'#(A"%(?"%':OR(>*&$:(K(b](H0Z(>*&$:?/

V*%?(_

>%$-_

>%$-L

>%$-b

>%$-g

V*%?(L

If each RF “lane” maps to 2 ALU “lanes” then rejiggering has still more opportunities to improve VPU utilization.

Reduce RF ports by accessing multiple cycles' worth of registers in one go.

RF muxing may already exist for scatter gather and strided register accesses. (Some GPUs have s/g RFs)

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

>%$-_

>%$-L

>%$-b

>%$-g

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

>%$-a

>%$-d

>%$-`

>%$-n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

'_ 'L 'b 'g 'a 'd '` 'n

Page 77: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,,

)#$(JP(OR(?%"G>$;

! !"&:,+$%(

" 32H(=,'#(A,M$+(t%$-:W'#%$*+

" $C-C(a(',;$("%(>*'$&/B('#%$*+:

" L`(>*&$:

" O(M(V(%$-:(1VKL`MgbKdLb(G,':8

! 2346(?*%*>>$>(#*:

" aO(M(V(%$-:('"'*>

! 0*&$('#%$*+,&-(#*:(>"-,/*>(OR(?$%(>*&$(?$%(',;$W>*'$&/B('#%$*+

J L`(>*&$:h(L`M(t%$-:

" V"%:$5("&>B(%$*+,&-(gb(G,':(?$%(>*&$(?$%(/B/>$(K(L`M([#,-#$%\

J (4"',@*'$:(=,+$%('#%$*+(>*&$:

" 7"'(gb(G,':

" P$%#*?:(Lbc(G,':Y(a]gb

" 2',>>(&"'(-""+(^(aM(t%$-:5(aM([#,-#$%\

Most threads don't use all regs.

Want variable #regs/thread. Hard in fixed ISA. ? 2 level RF microarchitecture? RF $?

Change from lane threads to SIMD threads as reg usage grows?

Page 78: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,"

O$i,--$%,&-()#%$*+:(G$'=$$&(!B/>$:("A(*(;9>',I/B/>$(V*%?

3L%$?$*'(,K_('"(CCC((((A"%('#%$*+:('K_('"(n

(,A($@$&1,8(]UO($@$&(1'8((('#$&(3b(((($>:$(3g

3a

0*&$(_)#%$*+(_

'_

0*&$(L)#%$*+(L

0*&$(b)#%$*+(b

0*&$(g)#%$*+(g

'L 'b 'g

U%,-,&*>5(3;G*>*&/$+(K(d_r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

,b,bU'#$%(V*%?:(CCC,g,gU'#$%(V*%?:(CCC,b,bU'#$%(V*%?:(CCC,g,g

'L 'g'd 'n

'_ 'b'a '`

'L 'g'd 'n

'_ 'b'a '`

'_ 'L 'b 'g

O$i,--$%$+(=,'#,&(V*@$A%"&'(K(L__r(9',>,F$+

U9%(V*%?U'#$%(V*%?:(CCC

'd 'a 'n '`

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

U'#$%(V*%?:(CCC

'L 'g'd 'n'_ 'b'a '`,g

,b

'L 'g'd 'n'_ 'b'a '`,b

,g

'L 'g'd 'n'_ 'b'a '`,b

,g

NM*;?>$(*::9;$:)#%$*+(:D$=,&-(G$'=$$&(=*@$:(,&(*(=*@$A%"&'

Page 79: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ,%

At least 4 levels of thread/warp rebalancing

• Rejiggering within a warp (wavefront, multicycle SIMT thread group) between cycles (phases, waves) of same warp

– Fairly easy – just a few more bits / logic on mask

• Migrating (exchanging) threads between warps but staying in same lane

– Fairly easy – more bits

• Migrating threads between lanes of same warp

– Hard – involves copying registers.

– Probably not worth doing, except for RF lane tricks

• Rebuilding warps  (wavefront, multicycle SIMT thread group) completely

– Hard – involves copying registers

– Probably only worthwhile after really long latency events.

Page 80: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "!

Summary of SIMT VPU Utilization Improvements

• SIMT inherently and fundamentally is more efficient, filtering out null masks. (Minor)

• SIMT + vector time pipelining

– More efficient masking and VL (Major)

– Extra instruction fetch opportunities during multicycle vector operations. (Major)

• Also: extra instruction fetch opportunities during multicycle wavefronts (Major)

• SIMT rebalancing

– Rejiggering within warp: same lane, different cycles (Major)

– Migrating between warps at same lane (Minor)

– Rebuilding warps (Major, but complex)

• Other: see Fung.  e.g. skewing

Page 81: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "'

Are SIMT threads Latency Tolerant threads?

Yes and No

• No

– GPUs always (?) have classic latency tolerant threads – switch to thread that uses some EUs to tolerate latencies, both EU and memory latency.

– SIMT threads are not used to tolerate EU latency

• Yes:

– SIMT threads generate more cache misses, more MLP

– In the absence of time multiplexed threads, they just idle their share of the vector lane when so doing.

• Similar to MIMD

– Q: Do non-threaded MIMD cores tolerate latency?

– A: yes: they generate MLP, but idle cores

– MIMD is MLP for when cores are cheap

– SIMT is just a step on the way to MIMT – for when cores, and x86 decode, etc. are not so cheap

Page 82: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "$

Are SIMT threads really threads at all?

• GPUs also have classic latency tolerant threads

– These form “groups” of the SIMT coherent vector lane threads.

• Different companies have different terminology

5?=$.=)*"'P&%?;1 !"#$%&'()*"'Q+,&")-1R

O<.-.) V*%?: )#%$*+:

@+4 V*@$A%"&': Y

4*$"='P"*H )#%$*+: Y(/#*&&$>:(Y

Some say: calling them threads is just dishonest marketing terminology

Me: they fill the software definition of threads (PC, SP, regs)

• So I asked the original inventors of Fine Grain Multithreading: Arvind (MIT), Burton Smith (MIT, HEP, Tera)

– They say: Yes, they are threads...

! k9:'(&"'($M*/'>B('#$(:*;$5(&"'(*:(/*?*G>$5(*:([#"%,F"&'*>\(>*'$&/B('">$%*&'('#%$*+:!"#$%$&'(.$/'"%(0*&$()#%$*+:

Page 83: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "(

]c`("&(*(.$/'"%(0*&$()#%$*+$+234)(4,/%"*%/#,'$/'9%$

+

:

M

%A

+s

M

%A

M

%A

M

%A

4

?/

,s

?/ ?/ ?/ ?/

]c`ZOU4

]c`NZ:

$C-C()$:'P0H

O%';&%B=">S

Unlike many microarchitecture ideas, x86 works well with SIMT.

X86 overhead – decoder, UROM – is naturally amortized over multiple SIMT vector lane threads.

Even x86 execution units like TestPLA can be shared.

No need for binary translation. But, of course, can use BT, e.g. to do speculation & optimization.

]c`+$/"+$

Page 84: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ")

32H($M'$&:,"&:(A"%(234)(^(A$B$C

• None are necessary

– i.e. there is no need for ISA extensions specifically for SIMT

• If we have hardware or hints to detect control independence

– Apart from usual CPUID, etc.

• None are desirable

– No ISA extensions should be added for SIMT which are not also desirable for MIMD.

– SIMT is just a step on the way to MIMD.

C

! 7$@$%'#$>$::CCC

" !"&'%">(3&+$?$&+$&/$(T,&':

3R(N023R(N02N(N763R0UUP(<ONHX(N760UUP

" !PZ36(G,':(A"%(:'%,+$

! J$&$%,/(J""+(2'9AA

" .*%,*G>$(t>%$-:(W('#%$*+

" O$-,:'$%(D,>>(#,&':

" .0(W(P.V(@*%,*G>$(>$&-'#(@$/'"%:

" 0,-#'=$,-#'()#%$*+,&-

" <*%%,$%:5("'#$%(2B&/#%"&,F*',"&

" !*/#$(!"&'%">W<B?*::(6,%$/',@$:

" 4$::*-$(P*::,&-

C

Page 85: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "*

234)(.$/'"%(0*&$()#%$*+,&-*&+(N&'$%?%,:$I>$@$>(O$>,*G,>,'B

+

:

M

%A

M

%A

M

%A

M

%A

?/

,s

?/ ?/ ?/ ?/

T8UUUUUU888'V'O%';&%B=">S

Remember lockstep? FRC? QMR?

Remember how we had to abandon lockstep at chip, and even core, boundaries?

SIMT vector lane threads naturally run in lockstep. Add checkers...

Better than RMT: spatial diversity. (add delay for temporal diversity)

Front end shared between threads: DIMT.

Page 86: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "+

What's this guff about “Coherence”? • “Coherence”

– standard term in GPU design

– “Coherent” as in LASER light

• Instruction Coherence

– Multiple vector lane threads executing same instruction PC at same time

#$ $ $/,+ (1,&# .2#3/1

" $ $ $ $ %45/1 ).(/5((#) .+ 31.( %&#(#+3,3.'+

• Data Coherence

– Multiple vector lane threads executing same, or similar, data memory references at same time

→ maybe share memory ports?

Page 87: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ",

6*'*(!"#$%$&/$

• Design challenge / opportunity

• Different lanes fetching

– exactly same data → trivial sharing

– strided data in same/different cache line(s) → less trivial. Share decode? TLB? Fill buffers?

– Scatter / gather … packetization

• If threads are control flow diverged, are data memory addresses diverged?

• If threads are rearranged, is data coherence lost?

– Can it be regained?

• Types of data coherence

– Exactly same memory location in different lane threads.

– Adjacent (stride 1) memory locations

– Stride-N

– Scatter/gather

• Small LUTs

• Widely separated

– Different memory addresses, same value

• e.g. different threads, local variable, spilled loop counter, same value

Page 88: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & ""

Intra vs. Inter Cacheline Threading • SIMD vectors work best dense, stride-1

Successive vector elements adjacent in cache line

• Intra:

– Neighbouring SIMT threads work on adjacent elements in cache line

– Or... elements in same cache line, but not necessarily adjacent. (Intra cache line scatter/gather.)

• Inter: SIMT threads work on entire cache line

e.g. using time pipelined vector instructions

e.g. neighboring SIMT threads work on neighbouring cache lines

• MIMD prefers inter cacheline threading

• SIMT prefers intra; inter → larger cache footprint

• Possibility: CPUID info, like VL/PVW strip mining

Page 89: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & "%

Why (Coherent Vector Lane) Threads are friendlier to software than SIMD vector parallel

• I found this strange

– As an old vector guy (Gould), I expected vectors to be well understood and liked

• However

– Supercomputer guys prefer MIMD-scalar

• most flexible,

• only one degree of optimization

– Microsoft DX and OpenGL are SIMT

• Not vector – compiler must convert SIMT threads to vector instructions, i.e. must unparallelize

• May be chicken and egg... but there you are.

• Robust wrt Change

– Fixed width SIMD vectors must be recoded when width changes, e.g. 512 → 1024b

– Short vectors are inefficient on wide vector architectures

– SIMT lanes individually narrower vectors

– SIMT code can run on MIMD, or on wider SIMT (esp. using techniques here)

Page 90: Coherent Vector Lane Threading (SIMT, DIMT, NIMT ... - Par Lab

!"#$"#!% & %!

Strawman Microarchitecture: To match a 512-bit wide SIMD in throughput and beat it in utilization and SW ease

• Rule of 4

– 512-bit wide datapath

– 4 “lanes” → 4 vector lane threads

• Each lane 128 bits wide = 4x32

– SIMT (or maybe DIMT)

– 4 cycles (or waves) per warp (or wavefront) → 16 threads per warp

• But remember using the tricks here → effectively 2 threads / warp comparable

• Warps need not take 4 cycles, can be as little as 1.

– 4 warps (i.e. 4 “true threads”)

– Cache: TBD