forked from romulus0914/MLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmlp_gpgpu.4.sm_50.ptx
4687 lines (4390 loc) · 207 KB
/
mlp_gpgpu.4.sm_50.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Module-level PTX directives. These apply to everything that follows in
// this module and must precede any kernel or function definition.
//
// PTX ISA version this module is written against.
.version 5.0
// Target architecture the module is generated for (sm_50).
.target sm_50
// Addresses are 64 bits wide throughout the module.
.address_size 64
.visible .entry _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi(
.param .u64 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_0,
.param .u32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_1,
.param .u64 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_2,
.param .u32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_3,
.param .u64 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_4,
.param .u32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_5,
.param .u32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_6,
.param .u64 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_7,
.param .u64 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_8,
.param .f32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_9,
.param .f32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_10,
.param .u32 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_11
)
.maxntid 64, 1, 1
.minnctapersm 4
{
.reg .pred %p<4>;
.reg .f32 %f<639>;
.reg .b32 %r<29>;
.reg .b64 %rd<90>;
.shared .align 4 .b8 _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs[1088];
ld.param.u64 %rd17, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_0];
ld.param.u32 %r3, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_1];
ld.param.u64 %rd18, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_2];
ld.param.u32 %r4, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_3];
ld.param.u64 %rd21, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_4];
ld.param.u32 %r5, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_5];
ld.param.u32 %r6, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_6];
ld.param.u64 %rd19, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_7];
ld.param.u64 %rd20, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_8];
ld.param.f32 %f620, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_9];
ld.param.f32 %f621, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_10];
ld.param.u32 %r7, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_11];
cvta.to.global.u64 %rd1, %rd21;
setp.eq.s32 %p1, %r7, 0;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd22, %rd19;
ld.global.f32 %f620, [%rd22];
cvta.to.global.u64 %rd23, %rd20;
ld.global.f32 %f621, [%rd23];
BB0_2:
cvta.to.global.u64 %rd24, %rd18;
cvta.to.global.u64 %rd25, %rd17;
mov.u32 %r9, %ctaid.x;
shl.b32 %r10, %r9, 6;
mov.u32 %r11, %ctaid.y;
shl.b32 %r12, %r11, 4;
mov.u32 %r13, %tid.y;
shl.b32 %r14, %r13, 4;
mov.u32 %r15, %tid.x;
add.s32 %r16, %r14, %r15;
add.s32 %r17, %r16, %r10;
add.s32 %r18, %r12, %r13;
mad.lo.s32 %r19, %r18, %r4, %r15;
mul.wide.s32 %rd26, %r19, 4;
add.s64 %rd89, %rd24, %rd26;
mad.lo.s32 %r20, %r12, %r5, %r17;
cvt.s64.s32 %rd3, %r20;
mul.wide.s32 %rd27, %r20, 4;
add.s64 %rd5, %rd1, %rd27;
mul.wide.s32 %rd28, %r15, 68;
mov.u64 %rd29, _Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs;
add.s64 %rd30, %rd29, %rd28;
mul.wide.s32 %rd31, %r13, 4;
add.s64 %rd6, %rd30, %rd31;
mad.lo.s32 %r21, %r9, 64, %r15;
mad.lo.s32 %r22, %r13, 16, %r21;
mul.wide.s32 %rd32, %r22, 4;
add.s64 %rd88, %rd25, %rd32;
shl.b32 %r23, %r3, 2;
mul.wide.s32 %rd8, %r23, 16;
shl.b32 %r24, %r4, 2;
mul.wide.s32 %rd9, %r24, 4;
mul.wide.s32 %rd10, %r3, 4;
mov.u32 %r28, 0;
mov.f32 %f637, 0f00000000;
mov.f32 %f636, %f637;
mov.f32 %f635, %f637;
mov.f32 %f634, %f637;
mov.f32 %f633, %f637;
mov.f32 %f632, %f637;
mov.f32 %f631, %f637;
mov.f32 %f630, %f637;
mov.f32 %f629, %f637;
mov.f32 %f628, %f637;
mov.f32 %f627, %f637;
mov.f32 %f626, %f637;
mov.f32 %f625, %f637;
mov.f32 %f624, %f637;
mov.f32 %f623, %f637;
mov.f32 %f622, %f637;
BB0_3:
ld.global.f32 %f21, [%rd88];
add.s64 %rd33, %rd88, %rd10;
ld.global.f32 %f22, [%rd33];
add.s64 %rd34, %rd33, %rd10;
ld.global.f32 %f23, [%rd34];
add.s64 %rd13, %rd34, %rd10;
ld.global.f32 %f24, [%rd13];
ld.global.f32 %f77, [%rd89];
st.shared.f32 [%rd6], %f77;
add.s64 %rd35, %rd89, %rd9;
ld.global.f32 %f78, [%rd35];
st.shared.f32 [%rd6+16], %f78;
add.s64 %rd36, %rd35, %rd9;
ld.global.f32 %f79, [%rd36];
st.shared.f32 [%rd6+32], %f79;
add.s64 %rd37, %rd36, %rd9;
ld.global.f32 %f80, [%rd37];
st.shared.f32 [%rd6+48], %f80;
bar.sync 0;
ld.shared.f32 %f81, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs];
fma.rn.vf32 %f82, %f21, %f81, %f637;
ld.shared.f32 %f83, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+4];
fma.rn.vf32 %f84, %f21, %f83, %f636;
ld.shared.f32 %f85, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+8];
fma.rn.vf32 %f86, %f21, %f85, %f635;
ld.shared.f32 %f87, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+12];
fma.rn.vf32 %f88, %f21, %f87, %f634;
ld.shared.f32 %f89, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+16];
fma.rn.vf32 %f90, %f21, %f89, %f633;
ld.shared.f32 %f91, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+20];
fma.rn.vf32 %f92, %f21, %f91, %f632;
ld.shared.f32 %f93, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+24];
fma.rn.vf32 %f94, %f21, %f93, %f631;
ld.shared.f32 %f95, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+28];
fma.rn.vf32 %f96, %f21, %f95, %f630;
ld.shared.f32 %f97, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+32];
fma.rn.vf32 %f98, %f21, %f97, %f629;
ld.shared.f32 %f99, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+36];
fma.rn.vf32 %f100, %f21, %f99, %f628;
ld.shared.f32 %f101, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+40];
fma.rn.vf32 %f102, %f21, %f101, %f627;
ld.shared.f32 %f103, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+44];
fma.rn.vf32 %f104, %f21, %f103, %f626;
ld.shared.f32 %f105, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+48];
fma.rn.vf32 %f106, %f21, %f105, %f625;
ld.shared.f32 %f107, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+52];
fma.rn.vf32 %f108, %f21, %f107, %f624;
ld.shared.f32 %f109, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+56];
fma.rn.vf32 %f110, %f21, %f109, %f623;
ld.shared.f32 %f111, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+60];
fma.rn.vf32 %f112, %f21, %f111, %f622;
add.s64 %rd38, %rd13, %rd10;
ld.shared.f32 %f113, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+68];
fma.rn.vf32 %f114, %f22, %f113, %f82;
ld.shared.f32 %f115, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+72];
fma.rn.vf32 %f116, %f22, %f115, %f84;
ld.shared.f32 %f117, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+76];
fma.rn.vf32 %f118, %f22, %f117, %f86;
ld.shared.f32 %f119, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+80];
fma.rn.vf32 %f120, %f22, %f119, %f88;
ld.shared.f32 %f121, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+84];
fma.rn.vf32 %f122, %f22, %f121, %f90;
ld.shared.f32 %f123, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+88];
fma.rn.vf32 %f124, %f22, %f123, %f92;
ld.shared.f32 %f125, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+92];
fma.rn.vf32 %f126, %f22, %f125, %f94;
ld.shared.f32 %f127, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+96];
fma.rn.vf32 %f128, %f22, %f127, %f96;
ld.shared.f32 %f129, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+100];
fma.rn.vf32 %f130, %f22, %f129, %f98;
ld.shared.f32 %f131, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+104];
fma.rn.vf32 %f132, %f22, %f131, %f100;
ld.shared.f32 %f133, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+108];
fma.rn.vf32 %f134, %f22, %f133, %f102;
ld.shared.f32 %f135, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+112];
fma.rn.vf32 %f136, %f22, %f135, %f104;
ld.shared.f32 %f137, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+116];
fma.rn.vf32 %f138, %f22, %f137, %f106;
ld.shared.f32 %f139, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+120];
fma.rn.vf32 %f140, %f22, %f139, %f108;
ld.shared.f32 %f141, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+124];
fma.rn.vf32 %f142, %f22, %f141, %f110;
ld.shared.f32 %f143, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+128];
fma.rn.vf32 %f144, %f22, %f143, %f112;
add.s64 %rd39, %rd38, %rd10;
ld.shared.f32 %f145, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+136];
fma.rn.vf32 %f146, %f23, %f145, %f114;
ld.shared.f32 %f147, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+140];
fma.rn.vf32 %f148, %f23, %f147, %f116;
ld.shared.f32 %f149, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+144];
fma.rn.vf32 %f150, %f23, %f149, %f118;
ld.shared.f32 %f151, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+148];
fma.rn.vf32 %f152, %f23, %f151, %f120;
ld.shared.f32 %f153, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+152];
fma.rn.vf32 %f154, %f23, %f153, %f122;
ld.shared.f32 %f155, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+156];
fma.rn.vf32 %f156, %f23, %f155, %f124;
ld.shared.f32 %f157, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+160];
fma.rn.vf32 %f158, %f23, %f157, %f126;
ld.shared.f32 %f159, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+164];
fma.rn.vf32 %f160, %f23, %f159, %f128;
ld.shared.f32 %f161, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+168];
fma.rn.vf32 %f162, %f23, %f161, %f130;
ld.shared.f32 %f163, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+172];
fma.rn.vf32 %f164, %f23, %f163, %f132;
ld.shared.f32 %f165, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+176];
fma.rn.vf32 %f166, %f23, %f165, %f134;
ld.shared.f32 %f167, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+180];
fma.rn.vf32 %f168, %f23, %f167, %f136;
ld.shared.f32 %f169, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+184];
fma.rn.vf32 %f170, %f23, %f169, %f138;
ld.shared.f32 %f171, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+188];
fma.rn.vf32 %f172, %f23, %f171, %f140;
ld.shared.f32 %f173, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+192];
fma.rn.vf32 %f174, %f23, %f173, %f142;
ld.shared.f32 %f175, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+196];
fma.rn.vf32 %f176, %f23, %f175, %f144;
add.s64 %rd40, %rd39, %rd10;
ld.shared.f32 %f177, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+204];
fma.rn.vf32 %f178, %f24, %f177, %f146;
ld.shared.f32 %f179, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+208];
fma.rn.vf32 %f180, %f24, %f179, %f148;
ld.shared.f32 %f181, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+212];
fma.rn.vf32 %f182, %f24, %f181, %f150;
ld.shared.f32 %f183, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+216];
fma.rn.vf32 %f184, %f24, %f183, %f152;
ld.shared.f32 %f185, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+220];
fma.rn.vf32 %f186, %f24, %f185, %f154;
ld.shared.f32 %f187, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+224];
fma.rn.vf32 %f188, %f24, %f187, %f156;
ld.shared.f32 %f189, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+228];
fma.rn.vf32 %f190, %f24, %f189, %f158;
ld.shared.f32 %f191, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+232];
fma.rn.vf32 %f192, %f24, %f191, %f160;
ld.shared.f32 %f193, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+236];
fma.rn.vf32 %f194, %f24, %f193, %f162;
ld.shared.f32 %f195, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+240];
fma.rn.vf32 %f196, %f24, %f195, %f164;
ld.shared.f32 %f197, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+244];
fma.rn.vf32 %f198, %f24, %f197, %f166;
ld.shared.f32 %f199, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+248];
fma.rn.vf32 %f200, %f24, %f199, %f168;
ld.shared.f32 %f201, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+252];
fma.rn.vf32 %f202, %f24, %f201, %f170;
ld.shared.f32 %f203, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+256];
fma.rn.vf32 %f204, %f24, %f203, %f172;
ld.shared.f32 %f205, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+260];
fma.rn.vf32 %f206, %f24, %f205, %f174;
ld.shared.f32 %f207, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+264];
fma.rn.vf32 %f208, %f24, %f207, %f176;
add.s64 %rd41, %rd40, %rd10;
ld.shared.f32 %f209, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+272];
ld.global.f32 %f210, [%rd38];
fma.rn.vf32 %f211, %f210, %f209, %f178;
ld.shared.f32 %f212, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+276];
fma.rn.vf32 %f213, %f210, %f212, %f180;
ld.shared.f32 %f214, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+280];
fma.rn.vf32 %f215, %f210, %f214, %f182;
ld.shared.f32 %f216, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+284];
fma.rn.vf32 %f217, %f210, %f216, %f184;
ld.shared.f32 %f218, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+288];
fma.rn.vf32 %f219, %f210, %f218, %f186;
ld.shared.f32 %f220, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+292];
fma.rn.vf32 %f221, %f210, %f220, %f188;
ld.shared.f32 %f222, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+296];
fma.rn.vf32 %f223, %f210, %f222, %f190;
ld.shared.f32 %f224, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+300];
fma.rn.vf32 %f225, %f210, %f224, %f192;
ld.shared.f32 %f226, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+304];
fma.rn.vf32 %f227, %f210, %f226, %f194;
ld.shared.f32 %f228, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+308];
fma.rn.vf32 %f229, %f210, %f228, %f196;
ld.shared.f32 %f230, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+312];
fma.rn.vf32 %f231, %f210, %f230, %f198;
ld.shared.f32 %f232, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+316];
fma.rn.vf32 %f233, %f210, %f232, %f200;
ld.shared.f32 %f234, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+320];
fma.rn.vf32 %f235, %f210, %f234, %f202;
ld.shared.f32 %f236, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+324];
fma.rn.vf32 %f237, %f210, %f236, %f204;
ld.shared.f32 %f238, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+328];
fma.rn.vf32 %f239, %f210, %f238, %f206;
ld.shared.f32 %f240, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+332];
fma.rn.vf32 %f241, %f210, %f240, %f208;
add.s64 %rd42, %rd41, %rd10;
ld.shared.f32 %f242, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+340];
ld.global.f32 %f243, [%rd39];
fma.rn.vf32 %f244, %f243, %f242, %f211;
ld.shared.f32 %f245, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+344];
fma.rn.vf32 %f246, %f243, %f245, %f213;
ld.shared.f32 %f247, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+348];
fma.rn.vf32 %f248, %f243, %f247, %f215;
ld.shared.f32 %f249, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+352];
fma.rn.vf32 %f250, %f243, %f249, %f217;
ld.shared.f32 %f251, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+356];
fma.rn.vf32 %f252, %f243, %f251, %f219;
ld.shared.f32 %f253, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+360];
fma.rn.vf32 %f254, %f243, %f253, %f221;
ld.shared.f32 %f255, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+364];
fma.rn.vf32 %f256, %f243, %f255, %f223;
ld.shared.f32 %f257, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+368];
fma.rn.vf32 %f258, %f243, %f257, %f225;
ld.shared.f32 %f259, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+372];
fma.rn.vf32 %f260, %f243, %f259, %f227;
ld.shared.f32 %f261, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+376];
fma.rn.vf32 %f262, %f243, %f261, %f229;
ld.shared.f32 %f263, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+380];
fma.rn.vf32 %f264, %f243, %f263, %f231;
ld.shared.f32 %f265, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+384];
fma.rn.vf32 %f266, %f243, %f265, %f233;
ld.shared.f32 %f267, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+388];
fma.rn.vf32 %f268, %f243, %f267, %f235;
ld.shared.f32 %f269, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+392];
fma.rn.vf32 %f270, %f243, %f269, %f237;
ld.shared.f32 %f271, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+396];
fma.rn.vf32 %f272, %f243, %f271, %f239;
ld.shared.f32 %f273, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+400];
fma.rn.vf32 %f274, %f243, %f273, %f241;
add.s64 %rd43, %rd42, %rd10;
ld.shared.f32 %f275, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+408];
ld.global.f32 %f276, [%rd40];
fma.rn.vf32 %f277, %f276, %f275, %f244;
ld.shared.f32 %f278, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+412];
fma.rn.vf32 %f279, %f276, %f278, %f246;
ld.shared.f32 %f280, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+416];
fma.rn.vf32 %f281, %f276, %f280, %f248;
ld.shared.f32 %f282, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+420];
fma.rn.vf32 %f283, %f276, %f282, %f250;
ld.shared.f32 %f284, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+424];
fma.rn.vf32 %f285, %f276, %f284, %f252;
ld.shared.f32 %f286, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+428];
fma.rn.vf32 %f287, %f276, %f286, %f254;
ld.shared.f32 %f288, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+432];
fma.rn.vf32 %f289, %f276, %f288, %f256;
ld.shared.f32 %f290, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+436];
fma.rn.vf32 %f291, %f276, %f290, %f258;
ld.shared.f32 %f292, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+440];
fma.rn.vf32 %f293, %f276, %f292, %f260;
ld.shared.f32 %f294, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+444];
fma.rn.vf32 %f295, %f276, %f294, %f262;
ld.shared.f32 %f296, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+448];
fma.rn.vf32 %f297, %f276, %f296, %f264;
ld.shared.f32 %f298, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+452];
fma.rn.vf32 %f299, %f276, %f298, %f266;
ld.shared.f32 %f300, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+456];
fma.rn.vf32 %f301, %f276, %f300, %f268;
ld.shared.f32 %f302, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+460];
fma.rn.vf32 %f303, %f276, %f302, %f270;
ld.shared.f32 %f304, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+464];
fma.rn.vf32 %f305, %f276, %f304, %f272;
ld.shared.f32 %f306, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+468];
fma.rn.vf32 %f307, %f276, %f306, %f274;
add.s64 %rd44, %rd43, %rd10;
ld.shared.f32 %f308, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+476];
ld.global.f32 %f309, [%rd41];
fma.rn.vf32 %f310, %f309, %f308, %f277;
ld.shared.f32 %f311, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+480];
fma.rn.vf32 %f312, %f309, %f311, %f279;
ld.shared.f32 %f313, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+484];
fma.rn.vf32 %f314, %f309, %f313, %f281;
ld.shared.f32 %f315, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+488];
fma.rn.vf32 %f316, %f309, %f315, %f283;
ld.shared.f32 %f317, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+492];
fma.rn.vf32 %f318, %f309, %f317, %f285;
ld.shared.f32 %f319, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+496];
fma.rn.vf32 %f320, %f309, %f319, %f287;
ld.shared.f32 %f321, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+500];
fma.rn.vf32 %f322, %f309, %f321, %f289;
ld.shared.f32 %f323, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+504];
fma.rn.vf32 %f324, %f309, %f323, %f291;
ld.shared.f32 %f325, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+508];
fma.rn.vf32 %f326, %f309, %f325, %f293;
ld.shared.f32 %f327, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+512];
fma.rn.vf32 %f328, %f309, %f327, %f295;
ld.shared.f32 %f329, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+516];
fma.rn.vf32 %f330, %f309, %f329, %f297;
ld.shared.f32 %f331, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+520];
fma.rn.vf32 %f332, %f309, %f331, %f299;
ld.shared.f32 %f333, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+524];
fma.rn.vf32 %f334, %f309, %f333, %f301;
ld.shared.f32 %f335, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+528];
fma.rn.vf32 %f336, %f309, %f335, %f303;
ld.shared.f32 %f337, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+532];
fma.rn.vf32 %f338, %f309, %f337, %f305;
ld.shared.f32 %f339, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+536];
fma.rn.vf32 %f340, %f309, %f339, %f307;
add.s64 %rd45, %rd44, %rd10;
ld.shared.f32 %f341, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+544];
ld.global.f32 %f342, [%rd42];
fma.rn.vf32 %f343, %f342, %f341, %f310;
ld.shared.f32 %f344, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+548];
fma.rn.vf32 %f345, %f342, %f344, %f312;
ld.shared.f32 %f346, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+552];
fma.rn.vf32 %f347, %f342, %f346, %f314;
ld.shared.f32 %f348, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+556];
fma.rn.vf32 %f349, %f342, %f348, %f316;
ld.shared.f32 %f350, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+560];
fma.rn.vf32 %f351, %f342, %f350, %f318;
ld.shared.f32 %f352, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+564];
fma.rn.vf32 %f353, %f342, %f352, %f320;
ld.shared.f32 %f354, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+568];
fma.rn.vf32 %f355, %f342, %f354, %f322;
ld.shared.f32 %f356, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+572];
fma.rn.vf32 %f357, %f342, %f356, %f324;
ld.shared.f32 %f358, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+576];
fma.rn.vf32 %f359, %f342, %f358, %f326;
ld.shared.f32 %f360, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+580];
fma.rn.vf32 %f361, %f342, %f360, %f328;
ld.shared.f32 %f362, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+584];
fma.rn.vf32 %f363, %f342, %f362, %f330;
ld.shared.f32 %f364, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+588];
fma.rn.vf32 %f365, %f342, %f364, %f332;
ld.shared.f32 %f366, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+592];
fma.rn.vf32 %f367, %f342, %f366, %f334;
ld.shared.f32 %f368, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+596];
fma.rn.vf32 %f369, %f342, %f368, %f336;
ld.shared.f32 %f370, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+600];
fma.rn.vf32 %f371, %f342, %f370, %f338;
ld.shared.f32 %f372, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+604];
fma.rn.vf32 %f373, %f342, %f372, %f340;
add.s64 %rd46, %rd45, %rd10;
ld.shared.f32 %f374, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+612];
ld.global.f32 %f375, [%rd43];
fma.rn.vf32 %f376, %f375, %f374, %f343;
ld.shared.f32 %f377, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+616];
fma.rn.vf32 %f378, %f375, %f377, %f345;
ld.shared.f32 %f379, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+620];
fma.rn.vf32 %f380, %f375, %f379, %f347;
ld.shared.f32 %f381, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+624];
fma.rn.vf32 %f382, %f375, %f381, %f349;
ld.shared.f32 %f383, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+628];
fma.rn.vf32 %f384, %f375, %f383, %f351;
ld.shared.f32 %f385, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+632];
fma.rn.vf32 %f386, %f375, %f385, %f353;
ld.shared.f32 %f387, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+636];
fma.rn.vf32 %f388, %f375, %f387, %f355;
ld.shared.f32 %f389, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+640];
fma.rn.vf32 %f390, %f375, %f389, %f357;
ld.shared.f32 %f391, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+644];
fma.rn.vf32 %f392, %f375, %f391, %f359;
ld.shared.f32 %f393, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+648];
fma.rn.vf32 %f394, %f375, %f393, %f361;
ld.shared.f32 %f395, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+652];
fma.rn.vf32 %f396, %f375, %f395, %f363;
ld.shared.f32 %f397, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+656];
fma.rn.vf32 %f398, %f375, %f397, %f365;
ld.shared.f32 %f399, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+660];
fma.rn.vf32 %f400, %f375, %f399, %f367;
ld.shared.f32 %f401, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+664];
fma.rn.vf32 %f402, %f375, %f401, %f369;
ld.shared.f32 %f403, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+668];
fma.rn.vf32 %f404, %f375, %f403, %f371;
ld.shared.f32 %f405, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+672];
fma.rn.vf32 %f406, %f375, %f405, %f373;
add.s64 %rd47, %rd46, %rd10;
ld.shared.f32 %f407, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+680];
ld.global.f32 %f408, [%rd44];
fma.rn.vf32 %f409, %f408, %f407, %f376;
ld.shared.f32 %f410, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+684];
fma.rn.vf32 %f411, %f408, %f410, %f378;
ld.shared.f32 %f412, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+688];
fma.rn.vf32 %f413, %f408, %f412, %f380;
ld.shared.f32 %f414, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+692];
fma.rn.vf32 %f415, %f408, %f414, %f382;
ld.shared.f32 %f416, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+696];
fma.rn.vf32 %f417, %f408, %f416, %f384;
ld.shared.f32 %f418, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+700];
fma.rn.vf32 %f419, %f408, %f418, %f386;
ld.shared.f32 %f420, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+704];
fma.rn.vf32 %f421, %f408, %f420, %f388;
ld.shared.f32 %f422, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+708];
fma.rn.vf32 %f423, %f408, %f422, %f390;
ld.shared.f32 %f424, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+712];
fma.rn.vf32 %f425, %f408, %f424, %f392;
ld.shared.f32 %f426, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+716];
fma.rn.vf32 %f427, %f408, %f426, %f394;
ld.shared.f32 %f428, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+720];
fma.rn.vf32 %f429, %f408, %f428, %f396;
ld.shared.f32 %f430, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+724];
fma.rn.vf32 %f431, %f408, %f430, %f398;
ld.shared.f32 %f432, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+728];
fma.rn.vf32 %f433, %f408, %f432, %f400;
ld.shared.f32 %f434, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+732];
fma.rn.vf32 %f435, %f408, %f434, %f402;
ld.shared.f32 %f436, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+736];
fma.rn.vf32 %f437, %f408, %f436, %f404;
ld.shared.f32 %f438, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+740];
fma.rn.vf32 %f439, %f408, %f438, %f406;
add.s64 %rd48, %rd47, %rd10;
ld.shared.f32 %f440, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+748];
ld.global.f32 %f441, [%rd45];
fma.rn.vf32 %f442, %f441, %f440, %f409;
ld.shared.f32 %f443, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+752];
fma.rn.vf32 %f444, %f441, %f443, %f411;
ld.shared.f32 %f445, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+756];
fma.rn.vf32 %f446, %f441, %f445, %f413;
ld.shared.f32 %f447, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+760];
fma.rn.vf32 %f448, %f441, %f447, %f415;
ld.shared.f32 %f449, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+764];
fma.rn.vf32 %f450, %f441, %f449, %f417;
ld.shared.f32 %f451, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+768];
fma.rn.vf32 %f452, %f441, %f451, %f419;
ld.shared.f32 %f453, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+772];
fma.rn.vf32 %f454, %f441, %f453, %f421;
ld.shared.f32 %f455, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+776];
fma.rn.vf32 %f456, %f441, %f455, %f423;
ld.shared.f32 %f457, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+780];
fma.rn.vf32 %f458, %f441, %f457, %f425;
ld.shared.f32 %f459, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+784];
fma.rn.vf32 %f460, %f441, %f459, %f427;
ld.shared.f32 %f461, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+788];
fma.rn.vf32 %f462, %f441, %f461, %f429;
ld.shared.f32 %f463, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+792];
fma.rn.vf32 %f464, %f441, %f463, %f431;
ld.shared.f32 %f465, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+796];
fma.rn.vf32 %f466, %f441, %f465, %f433;
ld.shared.f32 %f467, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+800];
fma.rn.vf32 %f468, %f441, %f467, %f435;
ld.shared.f32 %f469, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+804];
fma.rn.vf32 %f470, %f441, %f469, %f437;
ld.shared.f32 %f471, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+808];
fma.rn.vf32 %f472, %f441, %f471, %f439;
add.s64 %rd49, %rd48, %rd10;
ld.shared.f32 %f473, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+816];
ld.global.f32 %f474, [%rd46];
fma.rn.vf32 %f475, %f474, %f473, %f442;
ld.shared.f32 %f476, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+820];
fma.rn.vf32 %f477, %f474, %f476, %f444;
ld.shared.f32 %f478, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+824];
fma.rn.vf32 %f479, %f474, %f478, %f446;
ld.shared.f32 %f480, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+828];
fma.rn.vf32 %f481, %f474, %f480, %f448;
ld.shared.f32 %f482, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+832];
fma.rn.vf32 %f483, %f474, %f482, %f450;
ld.shared.f32 %f484, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+836];
fma.rn.vf32 %f485, %f474, %f484, %f452;
ld.shared.f32 %f486, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+840];
fma.rn.vf32 %f487, %f474, %f486, %f454;
ld.shared.f32 %f488, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+844];
fma.rn.vf32 %f489, %f474, %f488, %f456;
ld.shared.f32 %f490, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+848];
fma.rn.vf32 %f491, %f474, %f490, %f458;
ld.shared.f32 %f492, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+852];
fma.rn.vf32 %f493, %f474, %f492, %f460;
ld.shared.f32 %f494, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+856];
fma.rn.vf32 %f495, %f474, %f494, %f462;
ld.shared.f32 %f496, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+860];
fma.rn.vf32 %f497, %f474, %f496, %f464;
ld.shared.f32 %f498, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+864];
fma.rn.vf32 %f499, %f474, %f498, %f466;
ld.shared.f32 %f500, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+868];
fma.rn.vf32 %f501, %f474, %f500, %f468;
ld.shared.f32 %f502, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+872];
fma.rn.vf32 %f503, %f474, %f502, %f470;
ld.shared.f32 %f504, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+876];
fma.rn.vf32 %f505, %f474, %f504, %f472;
ld.shared.f32 %f506, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+884];
ld.global.f32 %f507, [%rd47];
fma.rn.vf32 %f508, %f507, %f506, %f475;
ld.shared.f32 %f509, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+888];
fma.rn.vf32 %f510, %f507, %f509, %f477;
ld.shared.f32 %f511, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+892];
fma.rn.vf32 %f512, %f507, %f511, %f479;
ld.shared.f32 %f513, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+896];
fma.rn.vf32 %f514, %f507, %f513, %f481;
ld.shared.f32 %f515, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+900];
fma.rn.vf32 %f516, %f507, %f515, %f483;
ld.shared.f32 %f517, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+904];
fma.rn.vf32 %f518, %f507, %f517, %f485;
ld.shared.f32 %f519, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+908];
fma.rn.vf32 %f520, %f507, %f519, %f487;
ld.shared.f32 %f521, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+912];
fma.rn.vf32 %f522, %f507, %f521, %f489;
ld.shared.f32 %f523, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+916];
fma.rn.vf32 %f524, %f507, %f523, %f491;
ld.shared.f32 %f525, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+920];
fma.rn.vf32 %f526, %f507, %f525, %f493;
ld.shared.f32 %f527, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+924];
fma.rn.vf32 %f528, %f507, %f527, %f495;
ld.shared.f32 %f529, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+928];
fma.rn.vf32 %f530, %f507, %f529, %f497;
ld.shared.f32 %f531, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+932];
fma.rn.vf32 %f532, %f507, %f531, %f499;
ld.shared.f32 %f533, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+936];
fma.rn.vf32 %f534, %f507, %f533, %f501;
ld.shared.f32 %f535, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+940];
fma.rn.vf32 %f536, %f507, %f535, %f503;
ld.shared.f32 %f537, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+944];
fma.rn.vf32 %f538, %f507, %f537, %f505;
ld.shared.f32 %f539, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+952];
ld.global.f32 %f540, [%rd48];
fma.rn.vf32 %f541, %f540, %f539, %f508;
ld.shared.f32 %f542, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+956];
fma.rn.vf32 %f543, %f540, %f542, %f510;
ld.shared.f32 %f544, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+960];
fma.rn.vf32 %f545, %f540, %f544, %f512;
ld.shared.f32 %f546, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+964];
fma.rn.vf32 %f547, %f540, %f546, %f514;
ld.shared.f32 %f548, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+968];
fma.rn.vf32 %f549, %f540, %f548, %f516;
ld.shared.f32 %f550, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+972];
fma.rn.vf32 %f551, %f540, %f550, %f518;
ld.shared.f32 %f552, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+976];
fma.rn.vf32 %f553, %f540, %f552, %f520;
ld.shared.f32 %f554, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+980];
fma.rn.vf32 %f555, %f540, %f554, %f522;
ld.shared.f32 %f556, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+984];
fma.rn.vf32 %f557, %f540, %f556, %f524;
ld.shared.f32 %f558, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+988];
fma.rn.vf32 %f559, %f540, %f558, %f526;
ld.shared.f32 %f560, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+992];
fma.rn.vf32 %f561, %f540, %f560, %f528;
ld.shared.f32 %f562, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+996];
fma.rn.vf32 %f563, %f540, %f562, %f530;
ld.shared.f32 %f564, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1000];
fma.rn.vf32 %f565, %f540, %f564, %f532;
ld.shared.f32 %f566, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1004];
fma.rn.vf32 %f567, %f540, %f566, %f534;
ld.shared.f32 %f568, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1008];
fma.rn.vf32 %f569, %f540, %f568, %f536;
ld.shared.f32 %f570, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1012];
fma.rn.vf32 %f571, %f540, %f570, %f538;
ld.shared.f32 %f572, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1020];
ld.global.f32 %f573, [%rd49];
fma.rn.vf32 %f637, %f573, %f572, %f541;
ld.shared.f32 %f574, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1024];
fma.rn.vf32 %f636, %f573, %f574, %f543;
ld.shared.f32 %f575, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1028];
fma.rn.vf32 %f635, %f573, %f575, %f545;
ld.shared.f32 %f576, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1032];
fma.rn.vf32 %f634, %f573, %f576, %f547;
ld.shared.f32 %f577, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1036];
fma.rn.vf32 %f633, %f573, %f577, %f549;
ld.shared.f32 %f578, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1040];
fma.rn.vf32 %f632, %f573, %f578, %f551;
ld.shared.f32 %f579, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1044];
fma.rn.vf32 %f631, %f573, %f579, %f553;
ld.shared.f32 %f580, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1048];
fma.rn.vf32 %f630, %f573, %f580, %f555;
ld.shared.f32 %f581, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1052];
fma.rn.vf32 %f629, %f573, %f581, %f557;
ld.shared.f32 %f582, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1056];
fma.rn.vf32 %f628, %f573, %f582, %f559;
ld.shared.f32 %f583, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1060];
fma.rn.vf32 %f627, %f573, %f583, %f561;
ld.shared.f32 %f584, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1064];
fma.rn.vf32 %f626, %f573, %f584, %f563;
ld.shared.f32 %f585, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1068];
fma.rn.vf32 %f625, %f573, %f585, %f565;
ld.shared.f32 %f586, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1072];
fma.rn.vf32 %f624, %f573, %f586, %f567;
ld.shared.f32 %f587, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1076];
fma.rn.vf32 %f623, %f573, %f587, %f569;
ld.shared.f32 %f588, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi$__cuda_local_var_22761_39_non_const_bs+1080];
fma.rn.vf32 %f622, %f573, %f588, %f571;
bar.sync 0;
add.s64 %rd89, %rd89, 64;
add.s64 %rd88, %rd88, %rd8;
add.s32 %r28, %r28, 16;
setp.lt.s32 %p2, %r28, %r6;
@%p2 bra BB0_3;
ld.param.u64 %rd87, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_4];
cvta.to.global.u64 %rd86, %rd87;
ld.param.u32 %r25, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_5];
setp.neu.f32 %p3, %f621, 0f00000000;
mul.f32 %f41, %f620, %f637;
cvt.s64.s32 %rd50, %r25;
add.s64 %rd51, %rd3, %rd50;
mul.f32 %f42, %f620, %f636;
mul.f32 %f43, %f620, %f635;
mul.f32 %f44, %f620, %f634;
mul.f32 %f45, %f620, %f633;
mul.f32 %f46, %f620, %f632;
mul.f32 %f47, %f620, %f631;
mul.f32 %f48, %f620, %f630;
mul.f32 %f49, %f620, %f629;
mul.f32 %f50, %f620, %f628;
mul.f32 %f51, %f620, %f627;
mul.f32 %f52, %f620, %f626;
mul.f32 %f53, %f620, %f625;
mul.f32 %f54, %f620, %f624;
mul.f32 %f55, %f620, %f623;
mul.wide.s32 %rd52, %r25, 14;
add.s64 %rd53, %rd52, %rd51;
shl.b64 %rd54, %rd53, 2;
add.s64 %rd16, %rd86, %rd54;
mul.f32 %f638, %f620, %f622;
@%p3 bra BB0_6;
bra.uni BB0_5;
BB0_6:
ld.param.u32 %r27, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_5];
ld.global.f32 %f589, [%rd5];
fma.rn.vf32 %f590, %f621, %f589, %f41;
st.global.f32 [%rd5], %f590;
mul.wide.s32 %rd70, %r27, 4;
add.s64 %rd71, %rd5, %rd70;
ld.global.f32 %f591, [%rd71];
fma.rn.vf32 %f592, %f621, %f591, %f42;
st.global.f32 [%rd71], %f592;
add.s64 %rd72, %rd71, %rd70;
ld.global.f32 %f593, [%rd72];
fma.rn.vf32 %f594, %f621, %f593, %f43;
st.global.f32 [%rd72], %f594;
add.s64 %rd73, %rd72, %rd70;
ld.global.f32 %f595, [%rd73];
fma.rn.vf32 %f596, %f621, %f595, %f44;
st.global.f32 [%rd73], %f596;
add.s64 %rd74, %rd73, %rd70;
ld.global.f32 %f597, [%rd74];
fma.rn.vf32 %f598, %f621, %f597, %f45;
st.global.f32 [%rd74], %f598;
add.s64 %rd75, %rd74, %rd70;
ld.global.f32 %f599, [%rd75];
fma.rn.vf32 %f600, %f621, %f599, %f46;
st.global.f32 [%rd75], %f600;
add.s64 %rd76, %rd75, %rd70;
ld.global.f32 %f601, [%rd76];
fma.rn.vf32 %f602, %f621, %f601, %f47;
st.global.f32 [%rd76], %f602;
add.s64 %rd77, %rd76, %rd70;
ld.global.f32 %f603, [%rd77];
fma.rn.vf32 %f604, %f621, %f603, %f48;
st.global.f32 [%rd77], %f604;
add.s64 %rd78, %rd77, %rd70;
ld.global.f32 %f605, [%rd78];
fma.rn.vf32 %f606, %f621, %f605, %f49;
st.global.f32 [%rd78], %f606;
add.s64 %rd79, %rd78, %rd70;
ld.global.f32 %f607, [%rd79];
fma.rn.vf32 %f608, %f621, %f607, %f50;
st.global.f32 [%rd79], %f608;
add.s64 %rd80, %rd79, %rd70;
ld.global.f32 %f609, [%rd80];
fma.rn.vf32 %f610, %f621, %f609, %f51;
st.global.f32 [%rd80], %f610;
add.s64 %rd81, %rd80, %rd70;
ld.global.f32 %f611, [%rd81];
fma.rn.vf32 %f612, %f621, %f611, %f52;
st.global.f32 [%rd81], %f612;
add.s64 %rd82, %rd81, %rd70;
ld.global.f32 %f613, [%rd82];
fma.rn.vf32 %f614, %f621, %f613, %f53;
st.global.f32 [%rd82], %f614;
add.s64 %rd83, %rd82, %rd70;
ld.global.f32 %f615, [%rd83];
fma.rn.vf32 %f616, %f621, %f615, %f54;
st.global.f32 [%rd83], %f616;
add.s64 %rd84, %rd83, %rd70;
ld.global.f32 %f617, [%rd84];
fma.rn.vf32 %f618, %f621, %f617, %f55;
st.global.f32 [%rd84], %f618;
add.s64 %rd85, %rd84, %rd70;
ld.global.f32 %f619, [%rd85];
fma.rn.vf32 %f638, %f621, %f619, %f638;
bra.uni BB0_7;
BB0_5:
ld.param.u32 %r26, [_Z12sgemmNN_corePKfiS0_iPfiiS0_S0_ffi_param_5];
st.global.f32 [%rd5], %f41;
mul.wide.s32 %rd55, %r26, 4;
add.s64 %rd56, %rd5, %rd55;
st.global.f32 [%rd56], %f42;
add.s64 %rd57, %rd56, %rd55;
st.global.f32 [%rd57], %f43;
add.s64 %rd58, %rd57, %rd55;
st.global.f32 [%rd58], %f44;
add.s64 %rd59, %rd58, %rd55;
st.global.f32 [%rd59], %f45;
add.s64 %rd60, %rd59, %rd55;
st.global.f32 [%rd60], %f46;
add.s64 %rd61, %rd60, %rd55;
st.global.f32 [%rd61], %f47;
add.s64 %rd62, %rd61, %rd55;
st.global.f32 [%rd62], %f48;
add.s64 %rd63, %rd62, %rd55;
st.global.f32 [%rd63], %f49;
add.s64 %rd64, %rd63, %rd55;
st.global.f32 [%rd64], %f50;
add.s64 %rd65, %rd64, %rd55;
st.global.f32 [%rd65], %f51;
add.s64 %rd66, %rd65, %rd55;
st.global.f32 [%rd66], %f52;
add.s64 %rd67, %rd66, %rd55;
st.global.f32 [%rd67], %f53;
add.s64 %rd68, %rd67, %rd55;
st.global.f32 [%rd68], %f54;
add.s64 %rd69, %rd68, %rd55;
st.global.f32 [%rd69], %f55;
BB0_7:
st.global.f32 [%rd16], %f638;
ret;
}
.visible .entry _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi(
.param .u64 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_0,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_1,
.param .u64 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_2,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_3,
.param .u64 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_4,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_5,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_6,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_7,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_8,
.param .u64 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_9,
.param .u64 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_10,
.param .f32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_11,
.param .f32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_12,
.param .u32 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_13
)
.maxntid 64, 1, 1
.minnctapersm 4
{
.reg .pred %p<52>;
.reg .f32 %f<786>;
.reg .b32 %r<122>;
.reg .b64 %rd<263>;
.shared .align 4 .b8 _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs[1088];
ld.param.u64 %rd47, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_0];
ld.param.u32 %r7, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_1];
ld.param.u64 %rd48, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_2];
ld.param.u32 %r8, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_3];
ld.param.u64 %rd49, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_4];
ld.param.u32 %r9, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_5];
ld.param.u32 %r10, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_6];
ld.param.u32 %r11, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_7];
ld.param.u32 %r12, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_8];
ld.param.u64 %rd50, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_9];
ld.param.u64 %rd51, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_10];
ld.param.f32 %f768, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_11];
ld.param.f32 %f769, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_12];
ld.param.u32 %r13, [_Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi_param_13];
setp.eq.s32 %p1, %r13, 0;
@%p1 bra BB1_2;
cvta.to.global.u64 %rd52, %rd50;
ld.global.f32 %f768, [%rd52];
cvta.to.global.u64 %rd53, %rd51;
ld.global.f32 %f769, [%rd53];
BB1_2:
cvta.to.global.u64 %rd54, %rd49;
cvta.to.global.u64 %rd55, %rd48;
cvta.to.global.u64 %rd56, %rd47;
mov.u32 %r15, %ctaid.x;
shl.b32 %r16, %r15, 6;
mov.u32 %r17, %ctaid.y;
shl.b32 %r1, %r17, 4;
mov.u32 %r18, %tid.y;
shl.b32 %r19, %r18, 4;
mov.u32 %r20, %tid.x;
add.s32 %r21, %r19, %r20;
add.s32 %r22, %r16, %r21;
cvt.s64.s32 %rd57, %r22;
add.s32 %r23, %r1, %r18;
mad.lo.s32 %r24, %r23, %r8, %r20;
mul.wide.s32 %rd58, %r24, 4;
add.s64 %rd262, %rd55, %rd58;
mad.lo.s32 %r25, %r1, %r9, %r22;
mul.wide.s32 %rd59, %r25, 4;
add.s64 %rd2, %rd54, %rd59;
add.s32 %r26, %r16, 64;
setp.le.s32 %p2, %r26, %r10;
setp.lt.s32 %p3, %r22, %r10;
or.pred %p4, %p2, %p3;
neg.s32 %r27, %r21;
cvt.s64.s32 %rd60, %r27;
selp.b64 %rd61, 0, %rd60, %p4;
add.s64 %rd62, %rd57, %rd61;
shl.b64 %rd63, %rd62, 2;
add.s64 %rd260, %rd56, %rd63;
mov.u32 %r120, 0;
mov.f32 %f785, 0f00000000;
mov.f32 %f784, %f785;
mov.f32 %f783, %f785;
mov.f32 %f782, %f785;
mov.f32 %f781, %f785;
mov.f32 %f780, %f785;
mov.f32 %f779, %f785;
mov.f32 %f778, %f785;
mov.f32 %f777, %f785;
mov.f32 %f776, %f785;
mov.f32 %f775, %f785;
mov.f32 %f774, %f785;
mov.f32 %f773, %f785;
mov.f32 %f772, %f785;
mov.f32 %f771, %f785;
mov.f32 %f770, %f785;
mov.u64 %rd258, %rd260;
BB1_3:
mov.u64 %rd4, %rd260;
mov.u64 %rd5, %rd258;
add.s32 %r28, %r1, 16;
setp.lt.s32 %p5, %r28, %r11;
shl.b32 %r29, %r8, 2;
mul.wide.s32 %rd64, %r29, 4;
add.s64 %rd7, %rd262, %rd64;
@%p5 bra BB1_13;
bra.uni BB1_4;
BB1_13:
ld.global.f32 %f139, [%rd262];
mul.wide.s32 %rd89, %r20, 68;
mov.u64 %rd90, _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs;
add.s64 %rd91, %rd90, %rd89;
mul.wide.s32 %rd92, %r18, 4;
add.s64 %rd93, %rd91, %rd92;
st.shared.f32 [%rd93], %f139;
ld.global.f32 %f140, [%rd7];
st.shared.f32 [%rd93+16], %f140;
add.s64 %rd95, %rd7, %rd64;
ld.global.f32 %f141, [%rd95];
st.shared.f32 [%rd93+32], %f141;
add.s64 %rd96, %rd95, %rd64;
ld.global.f32 %f142, [%rd96];
st.shared.f32 [%rd93+48], %f142;
bra.uni BB1_14;
BB1_4:
add.s32 %r31, %r120, %r20;
setp.ge.s32 %p6, %r31, %r12;
@%p6 bra BB1_14;
setp.ge.s32 %p7, %r23, %r11;
@%p7 bra BB1_7;
ld.global.f32 %f135, [%rd262];
mul.wide.s32 %rd65, %r20, 68;
mov.u64 %rd66, _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs;
add.s64 %rd67, %rd66, %rd65;
mul.wide.s32 %rd68, %r18, 4;
add.s64 %rd69, %rd67, %rd68;
st.shared.f32 [%rd69], %f135;
BB1_7:
add.s32 %r38, %r23, 4;
setp.ge.s32 %p8, %r38, %r11;
@%p8 bra BB1_9;
ld.global.f32 %f136, [%rd7];
mul.wide.s32 %rd70, %r20, 68;
mov.u64 %rd71, _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs;
add.s64 %rd72, %rd71, %rd70;
mul.wide.s32 %rd73, %r18, 4;
add.s64 %rd74, %rd72, %rd73;
st.shared.f32 [%rd74+16], %f136;
BB1_9:
add.s32 %r43, %r23, 8;
setp.ge.s32 %p9, %r43, %r11;
@%p9 bra BB1_11;
shl.b32 %r44, %r8, 3;
mul.wide.s32 %rd75, %r44, 4;
add.s64 %rd76, %rd262, %rd75;
ld.global.f32 %f137, [%rd76];
mul.wide.s32 %rd77, %r20, 68;
mov.u64 %rd78, _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs;
add.s64 %rd79, %rd78, %rd77;
mul.wide.s32 %rd80, %r18, 4;
add.s64 %rd81, %rd79, %rd80;
st.shared.f32 [%rd81+32], %f137;
BB1_11:
add.s32 %r49, %r23, 12;
setp.ge.s32 %p10, %r49, %r11;
@%p10 bra BB1_14;
mul.lo.s32 %r50, %r8, 12;
mul.wide.s32 %rd82, %r50, 4;
add.s64 %rd83, %rd262, %rd82;
ld.global.f32 %f138, [%rd83];
mul.wide.s32 %rd84, %r20, 68;
mov.u64 %rd85, _Z16gen_sgemmNN_corePKfiS0_iPfiiiiS0_S0_ffi$__cuda_local_var_22864_39_non_const_bs;
add.s64 %rd86, %rd85, %rd84;
mul.wide.s32 %rd87, %r18, 4;
add.s64 %rd88, %rd86, %rd87;