This repository has been archived by the owner on Dec 6, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqfplib-m3.s
2578 lines (2379 loc) · 83 KB
/
qfplib-m3.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@ Copyright 2016 Mark Owen
@ http://www.quinapalus.com
@ E-mail: [email protected]
@
@ This file is free software: you can redistribute it and/or modify
@ it under the terms of version 2 of the GNU General Public License
@ as published by the Free Software Foundation.
@
@ This file is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this file. If not, see <http://www.gnu.org/licenses/> or
@ write to the Free Software Foundation, Inc., 51 Franklin Street,
@ Fifth Floor, Boston, MA 02110-1301, USA.
.syntax unified
.cpu cortex-m3
.thumb
@ exported symbols
.global qfp_fadd
.global qfp_fsub
.global qfp_fmul
.global qfp_fdiv
.global qfp_fsqrt
.global qfp_fexp
.global qfp_fsin
.global qfp_fcos
.global qfp_fln
.global qfp_ftan
.global qfp_fatan2
@ This macro loads a 32-bit constant n into register rx.
.macro movlong rx,n
movw \rx,#(\n)&0xffff
movt \rx,#((\n)>>16)&0xffff
.endm
@ Addition and subtraction are handled using the following two macros.
@ fadd_s0 is used for adding arguments of like sign and subtracting
@ arguments of unlike sign; fsub_s0 for adding arguments of unlike sign
@ and subtracting arguments of like sign.
@
@ The parameters to the macros are the registers containing the
@ mantissas of the two arguments, the exponents of the two arguments,
@ and a flag indicating if the result is to be negated. In the
@ case of fadd_s0 ye has already been subtracted from xe, with
@ the result in r12.
@
@ fadd_s0 is split into two parts, fadd_s0a and fadd_s0b; likewise fsub_s0.
.macro fadd_s0a xm,ym,xe,ye,neg
rsbs \ye,r12,#31 @ get complementary mantissa shift
bmi 105f @ too great? result is just x
lsl \ye,\ym,\ye @ save sticky bits before...
lsr \ym,\ym,r12 @ ... aligning mantissas
adds r0,\xm,\ym @ the addition
cmp r0,#0x02000000 @ need to increment exponent?
bhs 106f
sub r0,#0x01000000
lsrs r0,#1
adc r0,r0,\xe,lsl#23 @ add in exponent, round mantissa
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
lsls \ye,#1
it ne @ sticky bits nonzero?
bxne r14 @ finished if so
it cs @ if tie in rounding, make result even
biccs r0,#1
bx r14
.endm
.macro fadd_s0b xm,ym,xe,ye,neg
.balign 4
106: @ case where exponent increments
cmp \xe,#0xfe @ will result become infinity?
bge 7f
lsrs r0,#2
adc r0,r0,\xe,lsl#23 @ add in exponent, round mantissa
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
cbz \ye,8f @ sticky bits zero?
bx r14 @ otherwise finished
.balign 4
8:
it cs @ if tie in rounding, make result even
biccs r0,#1
bx r14
.balign 4
105: @ x>>y so result is x
lsrs r0,\xm,#1
bfi r0,\xe,#23,#8 @ insert exponent
.if \neg
orr r0,#0x80000000
.endif
bx r14
.balign 4
7: @ infinite result
mov r0,#0x7f800000
.if \neg
orr r0,#0x80000000
.endif
bx r14
.endm
.macro fadd_s0 xm,ym,xe,ye,neg
fadd_s0a \xm,\ym,\xe,\ye,\neg
fadd_s0b \xm,\ym,\xe,\ye,\neg
.endm
.macro fsub_s0a xm,ym,xe,ye,neg
bic \xm,\xm,#0xff000000 @ clear exponent fields
bic \ym,\ym,#0xff000000
orrs \ym,\ym,#0x01000000 @ insert implied 1 in y only; clear carry
sub \ye,\xe,\ye @ difference of exponents
lsr r12,\ym,\ye @ align mantissas
rsb \ye,#32 @ complementary shift
lsl \ye,\ym,\ye @ sticky bits
cbz \ye,8f @ sticky bits=0?
sbcs \xm,r12 @ the subtraction: note carry is clear from above
bmi 109f @ does subtraction cause exponent to change?
.if \neg
add \xe,#256 @ insert negative sign bit above exponent if required
.endif
lsrs \xm,#1 @ round mantissa...
adc r0,\xm,\xe,lsl#23 @ ... and pack result
bx r14
.balign 4
8: @ here sticky bits are 0
subs \xm,r12 @ do subtraction with carry set
bmi 105f @ does subtraction cause exponent to change?
.if \neg
add \xe,#256 @ insert negative sign bit above exponent if required
.endif
lsrs \xm,#1 @ round mantissa...
adc r0,\xm,\xe,lsl#23 @ ... and pack result
it cc @ was rounding tied?
bxcc r14
bic r0,#1 @ if so, round to even
bx r14
.endm
.macro fsub_s0b xm,ym,xe,ye,neg
.balign 4
105: @ here xm is result-0x01000000, exponent in \xe, and sticky bits are zero
cmn \xm,#0x00800000 @ (i.e., result is exact)
blt 6f @ exponent decrease of more than 1?
@ here exponent is decreasing by exactly 1 and result is exact
cmp \xe,#1 @ will result be denormal?
beq 7f
add r0,\xm,\xe,lsl#23 @ pack result
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
bx r14
.balign 4
109: @ here xm is result-0x01000000, exponent in \xe
cmn \xm,#0x00800000
blt 6f @ exponent decrease of >1?
@ here exponent is decreasing by exactly 1
cmp \xe,#1 @ will result be denormal?
beq 7f
rsbs \ye,#0 @ calculate sticky bits of result
lsls \ye,#1 @ check rounding
adc r0,\xm,\xe,lsl#23 @ round and pack exponent
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
it cc @ potential rounding tie?
bxcc r14
it ne @ sticky bits<>0?
bxne r14
bic r0,#1 @ make even
bx r14
.balign 4
7: @ here the result is denormal (but the exponent has decreased by only 1)
add r0,\xm,#0x1000000 @ fix result
lsrs r0,#1 @ result is denormal so it is exact
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
bx r14
.balign 4
6: @ here if cancellation leads to exponent decrease of >1; result must be exact
adds \xm,#0x01000000 @ correct difference now <=0x007fffff
beq 7f @ zero result?
clz \ye,\xm @ 9 => shift up one place for normalised result
subs \ye,#8 @ \ye now has shift up for normal case
subs \xe,\ye @ correct exponent for normalising shift
subs \xe,\xe,#1 @ correct exponent offset
lsl r0,\xm,\ye @ perform normalising shift
ble 5f @ result is denormal?
bfi r0,\xe,#23,#8 @ pack exponent
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
bx r14
.balign 4
7:
movs r0,#0
bx r14
.balign 4
5:
rsb \xe,\xe,#1 @ result is denormal: get required down shift
lsrs r0,\xe @ generate denormal result
.if \neg
orr r0,#0x80000000 @ negate result if required
.endif
bx r14
.endm
.macro fsub_s0 xm,ym,xe,ye,neg
fsub_s0a \xm,\ym,\xe,\ye,\neg
fsub_s0b \xm,\ym,\xe,\ye,\neg
.endm
.section .text.qfp.faddfsub,"ax",%progbits
.balign 4
.thumb_func
qfp_fadd:
ubfx r2,r0,#23,#8 @ extract exponents
ubfx r3,r1,#23,#8
cbz r3,0f @ y 0/denormal?
10:
cbz r2,1f @ x 0/denormal?
20:
cmp r3,#0xff @ y Inf/NaN?
beq 2f
cmp r2,#0xff @ x Inf/NaN?
beq 3f
lsls r1,#1 @ shift up y, test sign
bcs fadd_m
lsls r0,#1 @ shift up x, test sign
bcs fadd_mp
fadd_pp: @ positive+positive
movs r12,#1 @ insert implied 1:s
bfi r0,r12,#24,#8
bfi r1,r12,#24,#8
subs r12,r2,r3 @ difference of exponents
bmi 104f @ ye>xe?
fadd_s0a r0,r1,r2,r3,0 @ fadd_pp is continued below
0: @ here y 0/denormal
cmp r2,#0x80
bge 30f @ x exponent large? treat y as zero
cmp r2,#0
beq 50f @ x also 0/denormal?
lsr r12,r1,#31 @ save y sign
lsls r1,#1 @ shift out y sign bit
clz r3,r1 @ normalise y
lsls r1,r3
beq 30f @ y zero?
lsrs r1,#8 @ shift back to Q23 position
orr r1,r1,r12,lsl#31 @ restore sign
rsb r3,#8 @ adjust exponent
b 10b @ and continue, treating y as normal value
1: @ here x 0/denormal
cmp r3,#0x80
bge 40f @ y exponent large? treat x as zero
lsr r12,r0,#31 @ save x sign
lsls r0,#1 @ shift out x sign bit
clz r2,r0 @ normalise x
lsls r0,r2
beq 40f @ x zero?
lsrs r0,#8 @ shift back to Q23 position
orr r0,r0,r12,lsl#31 @ restore sign
rsb r2,#8 @ adjust exponent
b 20b @ and continue, treating x as normal value
.balign 4
30: @ here x>>y
cmp r2,#0xff
it ne
bxne r14 @ x finite? return it
3: @ x Inf/NaN
lsls r12,r0,#9 @ x NaN?
it ne
orrne r0,#0x00400000 @ return Inf or modified NaN
bx r14
.balign 4
40: @ here y>>x
mov r0,r1 @ by default return y
cmp r3,#0xff @ y finite? return it
it ne
bxne r14
lsls r12,r0,#9 @ y NaN?
it ne
orrne r0,#0x00400000 @ return Inf or modified NaN
bx r14
.balign 4
2: @ here y is Inf/NaN
lsls r12,r1,#9 @ y NaN?
itt ne
orrne r0,r1,#0x00400000 @ return modified NaN
bxne r14
@ here y Inf
cmp r2,#0xff
itt ne @ x finite?
movne r0,r1
bxne r14 @ return y
lsls r12,r0,#9 @ x NaN?
itt ne
orrne r0,#0x00400000 @ return modified NaN
bxne r14
@ here x Inf, y Inf
teq r0,r1
it eq
bxeq r14 @ same signs? return it
mov r0,#0xff000000 @ otherwise return NaN
orr r0,#0x00c00000
bx r14
.balign 4
50: @ here both x,y 0/denormal
cmp r0,#0x80000000 @ -0+-0?
itt eq
cmpeq r1,#0x80000000 @ return -0
bxeq r14
cmp r0,#0
it mi
rsbmi r0,#0x80000000 @ x negative? make 2's complement
cmp r1,#0
it mi
rsbmi r1,#0x80000000 @ y negative? make 2's complement
adds r0,r1 @ direct addition
it mi
rsbmi r0,#0x80000000 @ convert back to sign-magnitude
bx r14
fadd_s0b r0,r1,r2,r3,0 @ continuation of fadd_pp
.balign 4
104:
subs r12,r3,r2 @ here ye>xe: exchange macro paramters
fadd_s0 r1,r0,r3,r2,0
.balign 4
fsub_m:
lsls r1,#1 @ shift up y, test sign
bcc fadd_mm @ drop into addition
@ drop into fadd_mp
fadd_mp: @ negative+positive
cmp r2,r3 @ which has larger magnitude?
it eq
cmpeq r0,r1
blt fadd_mp_xlty
fsub_s0 r0,r1,r2,r3,1 @ |x|>=|y|, subtract with negative result
.balign 4
fadd_mp_xlty:
fsub_s0 r1,r0,r3,r2,0 @ |x|<|y|, subtract with positive result
.balign 4
fadd_m:
lsls r0,#1 @ x negative?
bcc fadd_pm
fadd_mm: @ negative+negative
movs r12,#1 @ insert implied 1:s
bfi r0,r12,#24,#8
bfi r1,r12,#24,#8
subs r12,r2,r3 @ difference of exponents
bmi 4f @ ye>xe?
fadd_s0 r0,r1,r2,r3,1
.balign 4
4:
subs r12,r3,r2 @ here ye>xe: exchange macro paramters
fadd_s0 r1,r0,r3,r2,1
.balign 4
.thumb_func
qfp_fsub:
ubfx r2,r0,#23,#8 @ extract exponents
ubfx r3,r1,#23,#8
cbz r2,0f @ x 0/denormal?
10:
cbz r3,1f @ y 0/denormal?
20:
cmp r2,#0xff @ x Inf/NaN?
beq 2f
cmp r3,#0xff @ y Inf/NaN?
beq 3f
lsls r0,#1 @ shift up x, test sign
bcs fsub_m
lsls r1,#1 @ shift up y, test sign
bcs fadd_pp @ drop into addition
@ drop into fadd_pm
fadd_pm: @ positive+negative
cmp r2,r3 @ which has larger magnitude?
it eq
cmpeq r0,r1
blt fadd_pm_xlty
fsub_s0a r0,r1,r2,r3,0 @ |x|>=|y|, subtract with positive result
@ fadd_pm is continued below
.balign 4
0: @ here x 0/denormal
b 100f @ continue below
.balign 4
1: @ here y 0/denormal
cmp r2,#0x80
bge 40f @ x exponent large? treat y as zero
lsr r12,r1,#31 @ save y sign
lsls r1,#1 @ shift out y sign bit
clz r3,r1 @ normalise y
lsls r1,r3
beq 40f @ y zero?
lsrs r1,#8 @ shift back to Q23 position
orr r1,r1,r12,lsl#31 @ restore sign
rsb r3,#8 @ adjust exponent
b 20b @ and continue, treating y as normal value
.balign 4
fsub_s0b r0,r1,r2,r3,0 @ continuation of fadd_pm
.balign 4
fadd_pm_xlty:
fsub_s0 r1,r0,r3,r2,1 @ |x|<|y|, subtract with negative result
.balign 4
100:
cmp r3,#0x80
bge 30f @ y exponent large? treat x as zero
cmp r3,#0
beq 50f @ y also 0/denormal?
lsr r12,r0,#31 @ save x sign
lsls r0,#1 @ shift out x sign bit
clz r2,r0 @ normalise x
lsls r0,r2
beq 30f @ x zero?
lsrs r0,#8 @ shift back to Q23 position
orr r0,r0,r12,lsl#31 @ restore sign
rsb r2,#8 @ adjust exponent
b 10b @ and continue, treating x as normal value
.balign 4
30: @ here y>>x
cmp r3,#0xff
itt ne
eorne r0,r1,#0x80000000
bxne r14 @ y finite? return -y
3: @ y Inf/NaN
lsls r12,r1,#9 @ y NaN?
mov r0,r1
itt ne
orrne r0,#0x00400000 @ return modified NaN
bxne r14
eor r0,#0x80000000 @ return -Inf
bx r14
.balign 4
40: @ here x>>y
cmp r2,#0xff @ x finite? return it
it ne
bxne r14
lsls r12,r0,#9 @ x NaN?
it ne
orrne r0,#0x00400000 @ return Inf or modified NaN
bx r14
.balign 4
2: @ here x is Inf/NaN
lsls r12,r0,#9 @ x NaN?
itt ne
orrne r0,#0x00400000 @ return modified NaN
bxne r14
@ here x Inf
cmp r3,#0xff
it ne @ y finite?
bxne r14 @ return x
lsls r12,r1,#9 @ y NaN?
itt ne
orrne r0,r1,#0x00400000 @ return modified NaN
bxne r14
@ here x Inf, y Inf
teq r0,r1
it ne
bxne r14 @ differing signs? return x
mov r0,#0xff000000 @ otherwise return NaN
orr r0,#0x00c00000
bx r14
.balign 4
50: @ here both x,y 0/denormal
eor r1,#0x80000000 @ negate y
cmp r0,#0x80000000 @ -0+-0?
itt eq
cmpeq r1,#0x80000000 @ return -0
bxeq r14
cmp r0,#0
it mi
rsbmi r0,#0x80000000 @ x negative? make 2's complement
cmp r1,#0
it mi
rsbmi r1,#0x80000000 @ y negative? make 2's complement
adds r0,r1 @ direct addition
it mi
rsbmi r0,#0x80000000 @ convert back to sign-magnitude
bx r14
.section .text.qfp.fmul,"ax",%progbits
.balign 4
.thumb_func
qfp_fmul:
eors r2,r0,r1
ands r12,r2,#0x80000000 @ save sign of result
ubfx r3,r1,#23,#8 @ y exponent
cbz r3,fmul_y_0 @ y 0/denormal?
cmp r3,#0xff
beq fmul_y_inf @ y Inf/Nan?
11:
ubfx r2,r0,#23,#8 @ x exponent
cbz r2,fmul_x_0 @ x 0/denormal?
cmp r2,#0xff
beq fmul_x_inf @ x Inf/Nan?
10:
add r3,r3,r2 @ result exponent+254
lsls r0,#9 @ x Q32 with no implied 1
lsls r1,#8 @ y Q31
orr r1,r1,#0x80000000 @ ... with implied 1
umull r2,r0,r0,r1 @ (x-1)y Q63
adds r0,r1 @ xy Q63
bcc 0f @ will result need to be shifted down one place?
subs r3,#126 @ correct exponent (implied 1 present)
ble 5f @ result 0/denormal?
cmp r3,#255
bge 4f @ result Inf?
orrs r2,r2,r0,lsl#24
beq 1f @ sticky bits 0?
lsrs r0,#9 @ pack result
adc r0,r0,r3,lsl#23 @ rounding and exponent
orr r0,r0,r12 @ sign
bx r14
.balign 4
fmul_y_0: @ here y 0/denormal
ubfx r2,r0,#23,#8 @ x exponent
cmp r2,#0xff
beq 111f @ x Inf/NaN?
bics r1,r1,#0x80000000 @ clear sign
beq 8f @ y=0, x finite, so result is zero
clz r3,r1 @ y denormal: make normal...
subs r3,#8
lsls r1,r3
rsb r3,#1 @ ... and fix exponent
b 11b
.balign 4
fmul_x_0: @ x 0/denormal
bics r0,#0x80000000
beq 8f @ x=0? 0 result
clz r2,r0 @ x denormal: make normal...
subs r2,#8
lsls r0,r2
rsb r2,#1 @ ... and fix exponent
b 10b
.balign 4
1: @ sticky bits 0
lsrs r0,#9 @ pack result
adc r0,r0,r3,lsl#23 @ rounding and exponent
it cs
biccs r0,#1 @ round to even
orr r0,r0,r12 @ sign
bx r14
.balign 4
0: @ result must be shifted down one place
subs r3,#128 @ correct exponent (implied 1 absent)
ble 3f @ result 0/denormal?
cmp r3,#254
bge 4f @ result Inf?
orrs r2,r2,r0,lsl#25
beq 2f @ sticky bits 0?
lsrs r0,#8 @ pack result
adc r0,r0,r3,lsl#23
orr r0,r0,r12
bx r14
.balign 4
2: @ sticky bits 0
lsrs r0,#8 @ pack result
adc r0,r0,r3,lsl#23
it cs
biccs r0,#1 @ round to even
orr r0,r0,r12
bx r14
.balign 4
4: @ return +/- Inf
mov r0,#0x7f800000
orr r0,r0,r12
bx r14
.balign 4
5: @ 0/denormal result
subs r3,#1 @ get shift
subs r3,#0 @ set carry
rrxs r0,r0 @ shift in implied 1
rrxs r2,r2
3:
adds r3,#25 @ will become 0?
ble 8f
lsls r1,r0,r3 @ double-length shift
orrs r2,r2,r1
rsb r3,#33
lsrs r0,r3 @ sticky bits
bcc 9f @ not a potential tie?
adds r0,#1 @ rounding...
cmp r2,#0
it eq
biceq r0,#1 @ ... to even
9:
orr r0,r0,r12
bx r14
.balign 4
8:
mov r0,r12 @ 0 result with sign
bx r14
.balign 4
111: @ here y 0/denormal, x Inf/NaN
lsls r2,r0,#9
bne 2f @ x NaN?
bics r1,r1,#0x80000000
bne 3f @ y<>0?
4:
mov r0,#0xff000000 @ y=0, x Inf: return NaN
orr r0,r0,#0x00c00000
bx r14
.balign 4
2:
orr r0,r0,#0x00400000 @ return modified x NaN
bx r14
.balign 4
fmul_y_inf: @ here y Inf/NaN
lsls r3,r1,#9
beq 0f @ y Inf?
orr r0,r1,#0x00400000 @ return modified y NaN
bx r14
.balign 4
0: @ here y Inf
ubfx r2,r0,#23,#8 @ x exponent
cmp r2,#0xff
beq 1f @ x Inf/NaN?
bics r0,#0x80000000
beq 4b @ y=Inf, x=0? return NaN
3:
orr r0,r12,#0x7f800000 @ x<>0, y Inf: return signed Inf
bx r14
.balign 4
1: @ here y Inf, x Inf/NaN
lsls r2,r0,#9
beq 3b @ x Inf? return Inf
b 2f @ x NaN: return modified x NaN
.balign 4
fmul_x_inf: @ x Inf/NaN, y normal
lsls r2,r0,#9
beq 3b @ x Inf? return Inf
2: @ x NaN
orr r0,r0,#0x00400000 @ return modified x NaN
bx r14
@ This macro does the mantissa division operation in two cases.
@ When ds=0, x>=y and quotient of mantissas is 1..2.
@ When ds=1, x< y and quotient of mantissas is 0.5..1.
@ In the first case the result is Q24; in the second, Q25.
.macro div_s0 ds
lsls r0,#8 @ x Q31
lsrs r2,r1,#7+\ds @ approximation to y Q16|Q15
udiv r3,r0,r2 @ first quotient approximation Q31/Q16=Q15 1..2
lsls r0,#7+\ds @ x Q38|Q39
mls r0,r1,r3,r0 @ remainder Q23*Q15=Q38
lsls r0,#2-\ds @ remainder Q40
sdiv r2,r0,r2 @ second part of quotient Q40/Q16|Q15=Q24|Q25
lsls r0,#7+\ds @ Q47|Q48
mls r0,r1,r2,r0 @ updated remainder Q23*Q24|Q25=Q47|Q48
add r2,r2,r3,lsl#9 @ add quotient parts for Q24|Q25 quotient
cmp r0,r1 @ (signed) remainder out of range?
bhs 4f
.endm
.section .text.qfp.fdiv,"ax",%progbits
.balign 4
.thumb_func
qfp_fdiv:
eor r12,r0,r1
lsr r12,#31 @ sign bit of result in r12b0
ubfx r2,r0,#23,#8 @ extract x exponent
ubfx r3,r1,#23,#8 @ extract y exponent
cbz r2,fdiv_x_0 @ x 0/denormal?
cmp r2,#0xff
beq fdiv_x_inf @ x Inf/NaN?
10:
cbz r3,fdiv_y_0 @ y 0/denormal?
cmp r3,#0xff
beq fdiv_y_inf @ y Inf/NaN?
11:
subs r2,r3 @ difference in exponents
movs r3,#1
bfi r0,r3,#23,#9 @ insert implied 1:s
bfi r1,r3,#23,#9
cmp r0,r1 @ mantissa of x < mantissa of y?
blo 1f
@ x>=y
adds r2,#126 @ add exponent offset
cmp r2,#0xfe @ will result be 0/denormal/Inf?
bhs 50f
add r12,r2,r12,lsl#8 @ add in sign bit
div_s0 0 @ perform division
6:
lsrs r0,r2,#1 @ get rounding bit
adc r0,r0,r12,lsl#23 @ round and pack
bx r14 @ there can be no rounding tie if the result is normal (exercise for the reader)
.balign 4
fdiv_x_0: @ x 0/denormal
bics r0,#0x80000000
beq 30f @ x=0?
clz r2,r0 @ x denormal: make normal and fix exponent
subs r2,#8
lsls r0,r2
rsb r2,#1
b 10b
.balign 4
fdiv_y_0: @ y 0/denormal
bics r1,#0x80000000
beq 70f @ y=0, x finite? return +/-Inf
clz r3,r1 @ y denormal: make normal and fix exponent
subs r3,#8
lsls r1,r3
rsb r3,#1
b 11b
.balign 4
1: @ x<y
adds r2,#125 @ add exponent offset
cmp r2,#0xfe @ will result be 0/denormal/Inf?
bhs 60f
add r12,r2,r12,lsl#8 @ add in sign bit
div_s0 1 @ perform division
6:
lsrs r0,r2,#1 @ get rounding bit
adc r0,r0,r12,lsl#23 @ round and pack
bx r14
@ Here we trap the case where the remainder is out of range
@ because the (approximate) division quotient is not correct.
@ By this point the x<y and x>=y cases can be treated together.
.balign 4
4:
blt 5f @ negative remainder?
7:
adds r2,#1 @ increment quotient
subs r0,r1 @ adjust remainder
cmp r0,r1 @ finished?
blo 6b
b 7b
.balign 4
5:
subs r2,#1 @ decrement quotient
adds r0,r1 @ adjust remainder
bpl 6b @ finished?
b 5b
.balign 4
50: @ x>=y, denormal or infinte result
bge 70f @ infinite result?
rsbs r2,#1 @ shift for denormal result
add r12,r2,r12,lsl#31 @ save in r12
div_s0 0 @ perform division
b 6f
.balign 4
60: @ x<y, denormal or infinte result
bge 70f @ infinite result?
rsbs r2,#1 @ shift for denormal result
add r12,r2,r12,lsl#31 @ save in r12
div_s0 1 @ perform division
6:
rsb r3,r12,#33 @ complementary shift
lsls r3,r2,r3 @ bits about to be shifted off (less one)
orrs r0,r3 @ OR sticky bits with remainder
beq 8f @ sticky bits 0?
lsrs r0,r2,r12 @ shift down result
bic r12,#0xff
adc r0,r0,r12 @ round and pack
bx r14
.balign 4
8: @ sticky bits 0: possibly tied rounding case
lsrs r0,r2,r12 @ shift down result
bic r12,#0xff
adc r0,r0,r12
it cs @ tied?
biccs r0,#1 @ round to even
bx r14
@ Here we trap the case where the remainder is out of range
@ when the result is denormal.
.balign 4
4:
blt 5f @ negative remainder?
7:
adds r2,#1 @ increment quotient
subs r0,r1 @ adjust remainder
cmp r0,r1 @ finished?
blo 6b
b 7b
.balign 4
5:
subs r2,#1 @ decrement quotient
adds r0,r1 @ adjust remainder
bpl 6b @ finished?
b 5b
.balign 4
70: @ return signed infinity
lsl r0,r12,#31
orr r0,#0x7f800000
bx r14
.balign 4
30: @ x zero
lsls r3,r1,#1 @ test y
beq 31f @ 0/0? NaN
cmp r3,#0xff000000
bls 32f @ y not NaN?
33:
orr r0,r1,#0x00400000 @ return modified y NaN
bx r14
.balign 4
31: @ return NaN
mov r0,#0xff000000
orr r0,r0,#0x00c00000
bx r14
.balign 4
fdiv_x_inf: @ x Inf/NaN
lsls r2,r0,#9
beq 0f @ x Inf?
orr r0,r0,#0x00400000 @ return modified x NaN
bx r14
.balign 4
0: @ x Inf
cmp r3,#0xff
beq 1f @ y Inf/NaN?
mov r0,#0x7f800000 @ x Inf y finite: return +/-Inf
orr r0,r0,r12,lsl#31
bx r14
.balign 4
1: @ y Inf/NaN
lsls r3,r1,#9
bne 33b @ y NaN?
b 31b @ x Inf y Inf: return NaN
.balign 4
fdiv_y_inf: @ y Inf/NaN
lsls r3,r1,#9
bne 33b @ y NaN?
32:
lsl r0,r12,#31 @ return +/-0
bx r14
@ Square root is calculated using an initial 8-bit approximation from a
@ look-up table which is then refined twice. The look-up table is split
@ into two parts, depending on whether the exponent is odd or even. Seven
@ mantissa bits are used in the look-up. The final remainder is checked
@ and used to correct the approximate result if necessary.
.section .text.qfp.fsqrt,"ax",%progbits
.balign 4
rsqrt_t0:
.byte 181,182,183,183,184,185,186,186,187,188,188,189,190,190,191,192
.byte 192,193,194,194,195,196,196,197,198,198,199,200,200,201,201,202
.byte 203,203,204,205,205,206,206,207,208,208,209,210,210,211,211,212
.byte 213,213,214,214,215,216,216,217,217,218,219,219,220,220,221,221
.byte 222,223,223,224,224,225,225,226,227,227,228,228,229,229,230,230
.byte 231,232,232,233,233,234,234,235,235,236,237,237,238,238,239,239
.byte 240,240,241,241,242,242,243,243,244,244,245,246,246,247,247,248
.byte 248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255
rsqrt_t1:
.byte 128,129,129,130,130,131,131,132,132,133,133,134,134,135,135,136
.byte 136,136,137,137,138,138,139,139,140,140,141,141,142,142,142,143
.byte 143,144,144,145,145,146,146,146,147,147,148,148,149,149,149,150
.byte 150,151,151,152,152,152,153,153,154,154,155,155,155,156,156,157
.byte 157,157,158,158,159,159,159,160,160,161,161,161,162,162,163,163
.byte 163,164,164,165,165,165,166,166,166,167,167,168,168,168,169,169
.byte 170,170,170,171,171,171,172,172,173,173,173,174,174,174,175,175
.byte 175,176,176,177,177,177,178,178,178,179,179,179,180,180,180,181
.balign 4
.thumb_func
qfp_fsqrt:
bic r2,r0,#0xff000000 @ clear exponent
orr r2,#0x00800000 @ insert implied 1
asrs r1,r0,#24
bmi 1f @ negative argument?
bcs 2f @ odd exponent?
beq 3f @ zero exponent?
9: @ here we have even exponent (e.g. 2<=x<4 Q22)
add r12,r1,#0x3e @ exponent 80->7f, 82->80 etc.
ubfx r3,r2,#16,#7 @ extract 7 mantissa bits
adr r0,rsqrt_t0
ldrb r0,[r0,r3] @ first approximation to result Q7
mul r3,r0,r0 @ approximation squared Q14
sub r1,r2,r3,lsl#8 @ first remainder Q22
lsls r1,#14 @ Q36
sdiv r1,r1,r0 @ first correction*2 Q36/Q7=Q29
lsls r0,#7 @ Q14
add r0,r0,r1,asr#16 @ add in correction Q14
mul r3,r0,r0 @ Q28
rsb r1,r3,r2,lsl#6 @ second remainder Q28
lsls r1,#9 @ Q37
sdiv r1,r1,r0 @ second correction*2 Q37/Q14=Q23
add r0,r1,r0,lsl#10 @ add in correction Q24
mul r3,r0,r0 @ Q48
rsbs r3,r3,r2,lsl#26 @ remainder Q48
bmi 4f @ when approximation is not exact, remainder happens always to be negative
lsrs r0,#1 @ Q23 result
adc r0,r0,r12,lsl#23 @ pack and round (rounding can never tie)
bx r14
.balign 4
4: @ here approximation was too high
subs r0,#1 @ adjust result
mul r3,r0,r0
rsbs r3,r3,r2,lsl#26 @ recompute remainder Q48
bmi 5f @ still incorrect?
lsrs r0,#1 @ pack and round
adc r0,r0,r12,lsl#23
bx r14
.balign 4
5:
subs r0,#1 @ worst-case error is 2ulps at Q24
lsrs r0,#1 @ pack and round
adc r0,r0,r12,lsl#23
bx r14
.balign 4
2: @ here we have odd exponent (e.g. 1<=x<2 Q23)
cmp r1,#0x7f
beq 6f @ +Inf/NaN?
add r12,r1,#0x3f @ exponent 7f->7f, 81->80 etc.
ubfx r3,r2,#16,#7 @ extract 7 mantissa bits
adr r0,rsqrt_t1
ldrb r0,[r0,r3] @ first approximation to result Q7
mul r3,r0,r0 @ approximation squared Q14
sub r1,r2,r3,lsl#9 @ first remainder Q23
lsls r1,#13 @ Q36
sdiv r1,r1,r0 @ first correction*2 Q36/Q7=Q29
lsls r0,#7 @ Q14
add r0,r0,r1,asr#16 @ add in correction Q14
mul r3,r0,r0 @ Q28
rsb r1,r3,r2,lsl#5 @ second remainder Q28
lsls r1,#9 @ Q37
sdiv r1,r1,r0 @ second correction*2 Q37/Q14=Q23
add r0,r1,r0,lsl#10 @ add in correction Q24
mul r3,r0,r0 @ Q48
rsbs r3,r3,r2,lsl#25 @ remainder Q48
bmi 4f @ when approximation is not exact, remainder happens always to be negative
lsrs r0,#1 @ Q23 result
adc r0,r0,r12,lsl#23 @ pack and round (rounding can never tie)
bx r14
.balign 4
4: @ here approximation was too high
subs r0,#1 @ adjust result