This repository has been archived by the owner on Jul 6, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlecture5.tex
92 lines (72 loc) · 4.08 KB
/
lecture5.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
\chapter{Floating Point}
\section{IEEE-754 Floating Point Standard}
So far we can represent \(2^N\) integers, but what about very small and very large numbers? The goal of establishing a floating point standard is to keep \emph{high precision} and keep a decimal encoding that is compatible with two's complement.
\subsection{Floating Point Components (32 Bits)}
\begin{itemize}
\item Sign (1 bit): 0 means positive, 1 means negative
\item Exponent (8 bits): Biased notation $\implies$ smallest number is written as all zeros
\begin{itemize}
\item Bias = $-(2^{N-1}-1)$, where $N$ is the number of exponent bits (so the bias is $-127$ for $N = 8$)
\end{itemize}
\item Significand/Mantissa (23 bits): In normalized form, the implicit 1 on the left of the binary point is not stored
\end{itemize}
\subsection{Floating Point Conversion}
\begin{description}
\item[Normalized Floats:] \(\text{Value} = (-1)^{\textit{Sign}} \times 2^{\textit{Exponent + Bias}} \times 1.\textit{Significand}_2\)
\item[Denormalized Floats:] \(\text{Value} = (-1)^{\textit{Sign}} \times 2^{\textit{Exponent + Bias + 1}} \times 0.\textit{Significand}_2\)
\end{description}
\subsection{Range of Floating Point Values}
\begin{tabular}{ |c|c|c| }
\hline
\textbf{Exponent} & \textbf{Significand} & \textbf{Meaning} \\
\hline
0 & 0 & \text{Zero} \\
0 & \text{Nonzero} & \text{Denorm} \\
1--254 & \text{Anything} & \text{Normal} \\
255 & 0 & \text{Infinity} \\
255 & \text{Nonzero} & \text{NaN} \\
\hline
\end{tabular}
\begin{itemize}
\item Positive range: \([2^{-126}, (2 - 2^{-23}) \times 2^{127}]\)
\item Negative range: \([-(2 - 2^{-23}) \times 2^{127}, -2^{-126}]\)
\item Overflow: Magnitude of value is too \emph{large} to represent.
\item Underflow: Magnitude of value is too \emph{small} to represent.
\end{itemize}
\begin{description}
\item[Smallest positive value (denorm):] exponent = 0; significand = last bit 1
\((-1)^0 \times 0.0000....0001 \times 2^{0 - 127 + 1} = 2^{-23} \times 2^{-126}\)
\item[Smallest positive norm value:] exponent = 1; significand = all 0s
\((-1)^0 \times 1.0000....0000 \times 2^{1-127} = 2^{-126}\)
\item[Largest positive value (normal):] exponent = 254 (largest non-reserved exponent, \(11111110_2\)); significand = all 1s
\((-1)^0 \times 1.1111...1111 \times 2^{254 - 127} = (1 + (2^{-1} + 2^{-2} + ... + 2^{-23})) \times 2^{127} = (1 + (1 - 2^{-23})) \times 2^{127} = (2 - 2^{-23}) \times 2^{127}\)
\item[Largest positive denorm value:] exp = 0; significand = all 1s
\((-1)^0 \times 0.1111...1111 \times 2^{0 - 127 + 1} = (1 - 2^{-23}) \times 2^{-126}\)
\end{description}
\section{Floating Point Step Size}
Let $x$ be the biased exponent and $y$ be the significand.
\begin{itemize}
\item Current number = \((1+y) \times 2^{x - 127}\)
\item Next number = \((1+y + 2^{-23}) \times 2^{x - 127}\)
\item Step size = next\_num - curr\_num = \(2^{x-150}\)
\end{itemize}
Notice there are \(2^{23} \approx 8 \text{ million}\) possible bit patterns, with \emph{equal} spacing, between neighboring powers of 2. The step size increases by a factor of 2 each time the exponent increases by 1.
\begin{description}
\item[Denorm Step Size:] Identical for all denorm values since they all have the same exponent.
\begin{itemize}
\item Current number = \(y \times 2^{-126}\)
\item Next number = \((y + 2^{-23}) \times 2^{-126}\)
\item Step size = \(2^{-149}\)
\end{itemize}
\end{description}
\section{Floating Point Caveats}
\begin{description}
\item[Non-associative:] FP \emph{approximates} the real result.
\begin{itemize}
\item e.g. \(x = -1.5 \times 10^{38}\), \(y = 1.5 \times 10^{38}\), \(z = 1.0\)
\item \(x + (y + z) = -1.5 \times 10^{38} + (1.5 \times 10^{38} + 1.0) = -1.5 \times 10^{38} + 1.5 \times 10^{38} = 0.0\)
\item \((x + y) + z = (-1.5 \times 10^{38} + 1.5 \times 10^{38}) + 1.0 = 0.0 + 1.0 = 1.0\)
\end{itemize}
\item[Precision vs. Accuracy:] \emph{Precision} is the count of bits used to represent a value; \emph{accuracy} is the difference between the actual value and its computer representation.
\item[Rounding:] May occur during calculation and conversion/casting.
\end{description}